From de2bfe902936e3f7abfd4b55ad1149f75c5818b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ludovic=20Court=C3=A8s?= Date: Mon, 19 Nov 2018 15:40:21 +0100 Subject: [PATCH] Add (guix swh). * guix/swh.scm: New file. * Makefile.am (MODULES): Add it. --- Makefile.am | 1 + guix/swh.scm | 551 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 552 insertions(+) create mode 100644 guix/swh.scm diff --git a/Makefile.am b/Makefile.am index 69e66fad75..c5676b0b94 100644 --- a/Makefile.am +++ b/Makefile.am @@ -75,6 +75,7 @@ MODULES = \ guix/discovery.scm \ guix/git-download.scm \ guix/hg-download.scm \ + guix/swh.scm \ guix/monads.scm \ guix/monad-repl.scm \ guix/gexp.scm \ diff --git a/guix/swh.scm b/guix/swh.scm new file mode 100644 index 0000000000..c188e17c69 --- /dev/null +++ b/guix/swh.scm @@ -0,0 +1,551 @@ +;;; GNU Guix --- Functional package management for GNU +;;; Copyright © 2018 Ludovic Courtès +;;; +;;; This file is part of GNU Guix. +;;; +;;; GNU Guix is free software; you can redistribute it and/or modify it +;;; under the terms of the GNU General Public License as published by +;;; the Free Software Foundation; either version 3 of the License, or (at +;;; your option) any later version. +;;; +;;; GNU Guix is distributed in the hope that it will be useful, but +;;; WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;; GNU General Public License for more details. +;;; +;;; You should have received a copy of the GNU General Public License +;;; along with GNU Guix. If not, see . + +(define-module (guix swh) + #:use-module (guix base16) + #:use-module (guix build utils) + #:use-module ((guix build syscalls) #:select (mkdtemp!)) + #:use-module (web client) + #:use-module (web response) + #:use-module (json) + #:use-module (srfi srfi-1) + #:use-module (srfi srfi-9) + #:use-module (srfi srfi-11) + #:use-module (srfi srfi-19) + #:use-module (ice-9 match) + #:use-module (ice-9 regex) + #:use-module (ice-9 popen) + #:use-module ((ice-9 ftw) #:select (scandir)) + #:export (origin? + origin-id + origin-type + origin-url + origin-visits + lookup-origin + + visit? + visit-date + visit-origin + visit-url + visit-snapshot-url + visit-status + visit-number + visit-snapshot + + branch? + branch-name + branch-target + + release? + release-id + release-name + release-message + release-target + + revision? + revision-id + revision-date + revision-directory + lookup-revision + lookup-origin-revision + + content? + content-checksums + content-data-url + content-length + lookup-content + + directory-entry? + directory-entry-name + directory-entry-type + directory-entry-checksums + directory-entry-length + directory-entry-permissions + lookup-directory + directory-entry-target + + vault-reply? + vault-reply-id + vault-reply-fetch-url + vault-reply-object-id + vault-reply-object-type + vault-reply-progress-message + vault-reply-status + query-vault + request-cooking + vault-fetch + + swh-download)) + +;;; Commentary: +;;; +;;; This module provides bindings to the HTTP interface of Software Heritage. +;;; It allows you to browse the archive, look up revisions (such as SHA1 +;;; commit IDs), "origins" (code hosting URLs), content (files), etc. See +;;; for more information. +;;; +;;; The high-level 'swh-download' procedure allows you to download a Git +;;; revision from Software Heritage, provided it is available. +;;; +;;; Code: + +(define %swh-base-url + ;; Presumably we won't need to change it. + "https://archive.softwareheritage.org") + +(define (swh-url path . rest) + (define url + (string-append %swh-base-url path + (string-join rest "/" 'prefix))) + + ;; Ensure there's a trailing slash or we get a redirect. + (if (string-suffix? "/" url) + url + (string-append url "/"))) + +(define-syntax-rule (define-json-reader json->record ctor spec ...) + "Define JSON->RECORD as a procedure that converts a JSON representation, +read from a port, string, or hash table, into a record created by CTOR and +following SPEC, a series of field specifications." + (define (json->record input) + (let ((table (cond ((port? input) + (json->scm input)) + ((string? input) + (json-string->scm input)) + ((hash-table? input) + input)))) + (let-syntax ((extract-field (syntax-rules () + ((_ table (field key json->value)) + (json->value (hash-ref table key))) + ((_ table (field key)) + (hash-ref table key)) + ((_ table (field)) + (hash-ref table + (symbol->string 'field)))))) + (ctor (extract-field table spec) ...))))) + +(define-syntax-rule (define-json-mapping rtd ctor pred json->record + (field getter spec ...) ...) + "Define RTD as a record type with the given FIELDs and GETTERs, à la SRFI-9, +and define JSON->RECORD as a conversion from JSON to a record of this type." + (begin + (define-record-type rtd + (ctor field ...) + pred + (field getter) ...) + + (define-json-reader json->record ctor + (field spec ...) ...))) + +(define %date-regexp + ;; Match strings like "2014-11-17T22:09:38+01:00" or + ;; "2018-09-30T23:20:07.815449+00:00"". + (make-regexp "^([0-9]{4})-([0-9]{2})-([0-9]{2})T([0-9]{2}):([0-9]{2}):([0-9]{2})((\\.[0-9]+)?)([+-][0-9]{2}):([0-9]{2})$")) + +(define (string->date* str) + "Return a SRFI-19 date parsed from STR, a date string as returned by +Software Heritage." + ;; We can't use 'string->date' because of the timezone format: SWH returns + ;; "+01:00" when the '~z' template expects "+0100". So we roll our own! + (or (and=> (regexp-exec %date-regexp str) + (lambda (match) + (define (ref n) + (string->number (match:substring match n))) + + (make-date (let ((ns (match:substring match 8))) + (if ns + (string->number (string-drop ns 1)) + 0)) + (ref 6) (ref 5) (ref 4) + (ref 3) (ref 2) (ref 1) + (+ (* 3600 (ref 9)) ;time zone + (if (< (ref 9) 0) + (- (ref 10)) + (ref 10)))))) + str)) ;oops! + +(define* (call url decode #:optional (method http-get) + #:key (false-if-404? #t)) + "Invoke the endpoint at URL using METHOD. Decode the resulting JSON body +using DECODE, a one-argument procedure that takes an input port. When +FALSE-IF-404? is true, return #f upon 404 responses." + (let*-values (((response port) + (method url #:streaming? #t))) + ;; See . + (match (assq-ref (response-headers response) 'x-ratelimit-remaining) + (#f #t) + ((? (compose zero? string->number)) + (throw 'swh-error url response)) + (_ #t)) + + (cond ((= 200 (response-code response)) + (let ((result (decode port))) + (close-port port) + result)) + ((and false-if-404? + (= 404 (response-code response))) + (close-port port) + #f) + (else + (close-port port) + (throw 'swh-error url response))))) + +(define-syntax define-query + (syntax-rules (path) + "Define a procedure that performs a Software Heritage query." + ((_ (name args ...) docstring (path components ...) + json->value) + (define (name args ...) + docstring + (call (swh-url components ...) json->value))))) + +;; +(define-json-mapping make-origin origin? + json->origin + (id origin-id) + (visits-url origin-visits-url "origin_visits_url") + (type origin-type) + (url origin-url)) + +;; +(define-json-mapping make-visit visit? + json->visit + (date visit-date "date" string->date*) + (origin visit-origin) + (url visit-url "origin_visit_url") + (snapshot-url visit-snapshot-url "snapshot_url") + (status visit-status) + (number visit-number "visit")) + +;; +(define-json-mapping make-snapshot snapshot? + json->snapshot + (branches snapshot-branches "branches" json->branches)) + +;; This is used for the "branches" field of snapshots. +(define-record-type + (make-branch name target-type target-url) + branch? + (name branch-name) + (target-type branch-target-type) ;release | revision + (target-url branch-target-url)) + +(define (json->branches branches) + (hash-map->list (lambda (key value) + (make-branch key + (string->symbol + (hash-ref value "target_type")) + (hash-ref value "target_url"))) + branches)) + +;; +(define-json-mapping make-release release? + json->release + (id release-id) + (name release-name) + (message release-message) + (target-type release-target-type "target_type" string->symbol) + (target-url release-target-url "target_url")) + +;; +(define-json-mapping make-revision revision? + json->revision + (id revision-id) + (date revision-date "date" string->date*) + (directory revision-directory) + (directory-url revision-directory-url "directory_url")) + +;; +(define-json-mapping make-content content? + json->content + (checksums content-checksums "checksums" json->checksums) + (data-url content-data-url "data_url") + (file-type-url content-file-type-url "filetype_url") + (language-url content-language-url "language_url") + (length content-length) + (license-url content-license-url "license_url")) + +(define (json->checksums checksums) + (hash-map->list (lambda (key value) + (cons key (base16-string->bytevector value))) + checksums)) + +;; +(define-json-mapping make-directory-entry directory-entry? + json->directory-entry + (name directory-entry-name) + (type directory-entry-type "type" + (match-lambda + ("dir" 'directory) + (str (string->symbol str)))) + (checksums directory-entry-checksums "checksums" + (match-lambda + (#f #f) + (lst (json->checksums lst)))) + (id directory-entry-id "dir_id") + (length directory-entry-length) + (permissions directory-entry-permissions "perms") + (target-url directory-entry-target-url "target_url")) + +;; +(define-json-mapping make-save-reply save-reply? + json->save-reply + (origin-url save-reply-origin-url "origin_url") + (origin-type save-reply-origin-type "origin_type") + (request-date save-reply-request-date "save_request_date" + string->date*) + (request-status save-reply-request-status "save_request_status" + string->symbol) + (task-status save-reply-task-status "save_task_status" + (match-lambda + ("not created" 'not-created) + ((? string? str) (string->symbol str))))) + +;; +(define-json-mapping make-vault-reply vault-reply? + json->vault-reply + (id vault-reply-id) + (fetch-url vault-reply-fetch-url "fetch_url") + (object-id vault-reply-object-id "obj_id") + (object-type vault-reply-object-type "obj_type" string->symbol) + (progress-message vault-reply-progress-message "progress_message") + (status vault-reply-status "status" string->symbol)) + + +;;; +;;; RPCs. +;;; + +(define-query (lookup-origin url) + "Return an origin for URL." + (path "/api/1/origin/git/url" url) + json->origin) + +(define-query (lookup-content hash type) + "Return a content for HASH, of the given TYPE--e.g., \"sha256\"." + (path "/api/1/content" + (string-append type ":" + (bytevector->base16-string hash))) + json->content) + +(define-query (lookup-revision id) + "Return the revision with the given ID, typically a Git commit SHA1." + (path "/api/1/revision" id) + json->revision) + +(define-query (lookup-directory id) + "Return the directory with the given ID." + (path "/api/1/directory" id) + json->directory-entries) + +(define (json->directory-entries port) + (map json->directory-entry (json->scm port))) + +(define (origin-visits origin) + "Return the list of visits of ORIGIN, a record as returned by +'lookup-origin'." + (call (swh-url (origin-visits-url origin)) + (lambda (port) + (map json->visit (json->scm port))))) + +(define (visit-snapshot visit) + "Return the snapshot corresponding to VISIT." + (call (swh-url (visit-snapshot-url visit)) + json->snapshot)) + +(define (branch-target branch) + "Return the target of BRANCH, either a or a ." + (match (branch-target-type branch) + ('release + (call (swh-url (branch-target-url branch)) + json->release)) + ('revision + (call (swh-url (branch-target-url branch)) + json->revision)))) + +(define (lookup-origin-revision url tag) + "Return a corresponding to the given TAG for the repository +coming from URL. Example: + + (lookup-origin-release \"https://github.com/guix-mirror/guix/\" \"v0.8\") + => #< id: \"44941…\" …> + +The information is based on the latest visit of URL available. Return #f if +URL could not be found." + (match (lookup-origin url) + (#f #f) + (origin + (match (origin-visits origin) + ((visit . _) + (let ((snapshot (visit-snapshot visit))) + (match (and=> (find (lambda (branch) + (string=? (string-append "refs/tags/" tag) + (branch-name branch))) + (snapshot-branches snapshot)) + branch-target) + ((? release? release) + (release-target release)) + ((? revision? revision) + revision) + (#f ;tag not found + #f)))) + (() + #f))))) + +(define (release-target release) + "Return the revision that is the target of RELEASE." + (match (release-target-type release) + ('revision + (call (swh-url (release-target-url release)) + json->revision)))) + +(define (directory-entry-target entry) + "If ENTRY, a directory entry, has type 'directory, return its list of +directory entries; if it has type 'file, return its object." + (call (swh-url (directory-entry-target-url entry)) + (match (directory-entry-type entry) + ('file json->content) + ('directory json->directory-entries)))) + +(define* (save-origin url #:optional (type "git")) + "Request URL to be saved." + (call (swh-url "/api/1/origin/save" type "url" url) json->save-reply + http-post)) + +(define-query (save-origin-status url type) + "Return the status of a /save request for URL and TYPE (e.g., \"git\")." + (path "/api/1/origin/save" type "url" url) + json->save-reply) + +(define-query (query-vault id kind) + "Ask the availability of object ID and KIND to the vault, where KIND is +'directory or 'revision. Return #f if it could not be found, or a + on success." + ;; + ;; There's a single format supported for directories and revisions and for + ;; now, the "/format" bit of the URL *must* be omitted. + (path "/api/1/vault" (symbol->string kind) id) + json->vault-reply) + +(define (request-cooking id kind) + "Request the cooking of object ID and KIND (one of 'directory or 'revision) +to the vault. Return a ." + (call (swh-url "/api/1/vault" (symbol->string kind) id) + json->vault-reply + http-post)) + +(define* (vault-fetch id kind + #:key (log-port (current-error-port))) + "Return an input port from which a bundle of the object with the given ID +and KIND (one of 'directory or 'revision) can be retrieved, or #f if the +object could not be found. + +For a directory, the returned stream is a gzip-compressed tarball. For a +revision, it is a gzip-compressed stream for 'git fast-import'." + (let loop ((reply (query-vault id kind))) + (match reply + (#f + (and=> (request-cooking id kind) loop)) + (_ + (match (vault-reply-status reply) + ('done + ;; Fetch the bundle. + (let-values (((response port) + (http-get (swh-url (vault-reply-fetch-url reply)) + #:streaming? #t))) + (if (= (response-code response) 200) + port + (begin ;shouldn't happen + (close-port port) + #f)))) + ('failed + ;; Upon failure, we're supposed to try again. + (format log-port "SWH vault: failure: ~a~%" + (vault-reply-progress-message reply)) + (format log-port "SWH vault: retrying...~%") + (loop (request-cooking id kind))) + ((and (or 'new 'pending) status) + ;; Wait until the bundle shows up. + (let ((message (vault-reply-progress-message reply))) + (when (eq? 'new status) + (format log-port "SWH vault: \ +requested bundle cooking, waiting for completion...~%")) + (when (string? message) + (format log-port "SWH vault: ~a~%" message)) + + ;; Wait long enough so we don't exhaust our maximum number of + ;; requests per hour too fast (as of this writing, the limit is 60 + ;; requests per hour per IP address.) + (sleep (if (eq? status 'new) 60 30)) + + (loop (query-vault id kind))))))))) + + +;;; +;;; High-level interface. +;;; + +(define (commit-id? reference) + "Return true if REFERENCE is likely a commit ID, false otherwise---e.g., if +it is a tag name." + (and (= (string-length reference) 40) + (string-every char-set:hex-digit reference))) + +(define (call-with-temporary-directory proc) ;FIXME: factorize + "Call PROC with a name of a temporary directory; close the directory and +delete it when leaving the dynamic extent of this call." + (let* ((directory (or (getenv "TMPDIR") "/tmp")) + (template (string-append directory "/guix-directory.XXXXXX")) + (tmp-dir (mkdtemp! template))) + (dynamic-wind + (const #t) + (lambda () + (proc tmp-dir)) + (lambda () + (false-if-exception (delete-file-recursively tmp-dir)))))) + +(define (swh-download url reference output) + "Download from Software Heritage a checkout of the Git tag or commit +REFERENCE originating from URL, and unpack it in OUTPUT. Return #t on success +and #f on failure. + +This procedure uses the \"vault\", which contains \"cooked\" directories in +the form of tarballs. If the requested directory is not cooked yet, it will +wait until it becomes available, which could take several minutes." + (match (if (commit-id? reference) + (lookup-revision reference) + (lookup-origin-revision url reference)) + ((? revision? revision) + (call-with-temporary-directory + (lambda (directory) + (let ((input (vault-fetch (revision-directory revision) 'directory)) + (tar (open-pipe* OPEN_WRITE "tar" "-C" directory "-xzvf" "-"))) + (dump-port input tar) + (close-port input) + (let ((status (close-pipe tar))) + (unless (zero? status) + (error "tar extraction failure" status))) + + (match (scandir directory) + (("." ".." sub-directory) + (copy-recursively (string-append directory "/" sub-directory) + output + #:log (%make-void-port "w")) + #t)))))) + (#f + #f)))