From 9505b54a4f9f0265c9d8be53763f0c59d6f62a44 Mon Sep 17 00:00:00 2001 From: zimoun Date: Tue, 19 Jan 2021 22:28:08 +0100 Subject: [PATCH] utils: Add string distance. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * guix/utils.scm (string-distance): New procedure. (string-closest): New procedure. * tests/utils.scm ("string-distance", "string-closest"): New tests. Signed-off-by: Ludovic Courtès --- guix/utils.scm | 47 ++++++++++++++++++++++++++++++++++++++++++++++- tests/utils.scm | 18 ++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/guix/utils.scm b/guix/utils.scm index a85e2f495c..1625cab19b 100644 --- a/guix/utils.scm +++ b/guix/utils.scm @@ -8,6 +8,7 @@ ;;; Copyright © 2017 Mathieu Othacehe ;;; Copyright © 2018, 2020 Marius Bakke ;;; Copyright © 2020 Efraim Flashner +;;; Copyright © 2021 Simon Tournier ;;; ;;; This file is part of GNU Guix. ;;; @@ -37,6 +38,7 @@ (define-module (guix utils) #:use-module (guix memoization) #:use-module ((guix build utils) #:select (dump-port mkdir-p delete-file-recursively)) #:use-module ((guix build syscalls) #:select (mkdtemp! fdatasync)) + #:use-module ((guix combinators) #:select (fold2)) #:use-module (guix diagnostics) ;, &error-location, etc. #:use-module (ice-9 format) #:use-module (ice-9 regex) @@ -115,7 +117,10 @@ (define-module (guix utils) call-with-decompressed-port compressed-output-port call-with-compressed-output-port - canonical-newline-port)) + canonical-newline-port + + string-distance + string-closest)) ;;; @@ -880,6 +885,46 @@ (define-syntax current-source-directory ;; raising an error would upset Geiser users #f)))))) + +;;; +;;; String comparison. +;;; + +(define (string-distance s1 s2) + "Compute the Levenshtein distance between two strings." + ;; Naive implemenation + (define loop + (mlambda (as bt) + (match as + (() (length bt)) + ((a s ...) + (match bt + (() (length as)) + ((b t ...) + (if (char=? a b) + (loop s t) + (1+ (min + (loop as t) + (loop s bt) + (loop s t)))))))))) + + (let ((c1 (string->list s1)) + (c2 (string->list s2))) + (loop c1 c2))) + +(define* (string-closest trial tests #:key (threshold 3)) + "Return the string from TESTS that is the closest from the TRIAL, +according to 'string-distance'. If the TESTS are too far from TRIAL, +according to THRESHOLD, then #f is returned." + (identity ;discard second return value + (fold2 (lambda (test closest minimal) + (let ((dist (string-distance trial test))) + (if (and (< dist minimal) (< dist threshold)) + (values test dist) + (values closest minimal)))) + #f +inf.0 + tests))) + ;;; Local Variables: ;;; eval: (put 'call-with-progress-reporter 'scheme-indent-function 1) ;;; End: diff --git a/tests/utils.scm b/tests/utils.scm index 62ec7e8b4c..7fcbb25552 100644 --- a/tests/utils.scm +++ b/tests/utils.scm @@ -2,6 +2,7 @@ ;;; Copyright © 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021 Ludovic Courtès ;;; Copyright © 2014 Eric Bavier ;;; Copyright © 2016 Mathieu Lirzin +;;; Copyright © 2021 Simon Tournier ;;; ;;; This file is part of GNU Guix. ;;; @@ -271,6 +272,23 @@ (define (test-compression/decompression method run?) string-reverse) (call-with-input-file temp-file get-string-all))) +(test-equal "string-distance" + '(0 1 1 5 5) + (list + (string-distance "hello" "hello") + (string-distance "hello" "helo") + (string-distance "helo" "hello") + (string-distance "" "hello") + (string-distance "hello" ""))) + +(test-equal "string-closest" + '("hello" "hello" "helo" #f) + (list + (string-closest "hello" '("hello")) + (string-closest "hello" '("helo" "hello" "halo")) + (string-closest "hello" '("kikoo" "helo" "hihihi" "halo")) + (string-closest "hello" '("aaaaa" "12345" "hellohello" "h")))) + (test-end) (false-if-exception (delete-file temp-file))