Skip to content

Instantly share code, notes, and snippets.

@ericnormand
Last active December 16, 2019 16:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ericnormand/704f07e8aa041edace319afead3aac73 to your computer and use it in GitHub Desktop.
Save ericnormand/704f07e8aa041edace319afead3aac73 to your computer and use it in GitHub Desktop.

Levenshtein distance

The Levenshtein distance measures the edit distance between two strings. That is, how many one-character changes do you need to make between two strings to turn one into the other. The algorithm has a nice recursive definition on Wikipedia, which makes it easy to write in Clojure.

Your goal is to implement the Levenshtein distance as a function of two strings. It should return the edit distance.

Bonus: use memoization to make it more efficient, or use an iterative method.

(defn lev
([a b]
(let [f (memoize lev)]
(f f a b (count a) (count b))))
([f a b i j]
(cond
(zero? i)
j
(zero? j)
i
:else
(min
(inc (f f a b (dec i) j))
(inc (f f a b i (dec j)))
(+ (if (= (.charAt a (dec i))
(.charAt b (dec j)))
0
1)
(f f a b (dec i) (dec j)))))))
(defn levdist
[s slen t tlen]
(let [cost (if (= (get s (dec slen)) (get t (dec tlen))) 0 1)]
(if (zero? slen) tlen
(if (zero? tlen) slen
(min (inc (levdist s (dec slen) t tlen))
(inc (levdist s slen t (dec tlen)))
(+ (levdist s (dec slen) t (dec tlen)) cost))))))
;; memoized levdist
(def memlevdist
(memoize
(fn [s slen t tlen]
(let [cost (if (= (get s (dec slen)) (get t (dec tlen))) 0 1)]
(if (zero? slen) tlen
(if (zero? tlen) slen
(min (inc (memlevdist s (dec slen) t tlen))
(inc (memlevdist s slen t (dec tlen)))
(+ (memlevdist s (dec slen) t (dec tlen)) cost))))))))
(def s1 "weholdthese")
(def s2 "toboldlygo")
;; regular
(time (levdist s1 (count s1) s2 (count s2)))
;; "Elapsed time: 11066.254867 msecs"
;; 8
;; memoized
(time (memlevdist s1 (count s1) s2 (count s2)))
;; "Elapsed time: 0.802101 msecs"
;; 8
;; Note, a 2nd run of memlevdist with the same inputs runs even faster of course. About .05 msecs.
(defn leven-slow [xs ys]
(cond (= xs ys) 0
(empty? xs) (count ys)
(empty? ys) (count xs)
:else (min (inc (leven-slow (rest xs) ys))
(inc (leven-slow xs (rest ys)))
(+ (if (= (first xs) (first ys)) 0 1)
(leven-slow (rest xs) (rest ys))))))
;; Memoized recursive function, a mash-up of memoize and fn
(defmacro mrfn
"Returns an anonymous function like `fn` but recursive calls to the given `name` within
`body` use a memoized version of the function, potentially improving performance (see
`memoize`). Only simple argument symbols are supported, not varargs or destructing or
multiple arities. Memoized recursion requires explicit calls to `name` so the `body`
should not use recur to the top level."
[name args & body]
{:pre [(simple-symbol? name) (vector? args) (seq args) (every? simple-symbol? args)]}
(let [akey (if (= (count args) 1) (first args) args)]
;; name becomes extra arg to support recursive memoized calls
`(let [f# (fn [~name ~@args] ~@body)
mem# (atom {})]
(fn mr# [~@args]
(if-let [e# (find @mem# ~akey)]
(val e#)
(let [ret# (f# mr# ~@args)]
(swap! mem# assoc ~akey ret#)
ret#))))))
(defn levenshtein
"returns the Levenshtein distance between the strings a and b"
[a b]
(let [mrlev (mrfn lev [xs ys]
(cond (empty? xs) (count ys)
(empty? ys) (count xs)
:else (min (inc (lev (rest xs) ys))
(inc (lev xs (rest ys)))
(+ (if (= (first xs) (first ys)) 0 1)
(lev (rest xs) (rest ys))))))]
(if (= a b)
0
(mrlev (seq a) (seq b)))))
(defn smoke-test-lev
([] (smoke-test-lev levenshtein))
([lev]
(assert (zero? (lev "" "")))
(assert (= 1 (lev "test" "tent")))
(assert (= 1 (lev "tent" "test")))
(assert (= 2 (lev "lawn" "flaw")))
(assert (= 2 (lev "flaw" "lawn")))
(assert (= 3 (lev "kitten" "sitting")))
(assert (= 3 (lev "sitting" "kitten")))
(assert (zero? (lev "levenshtein" "levenshtein")))
(assert (= 1 (lev "blevenshtein" "alevenshtein")))
(assert (= 2 (lev "levenshtein" "levenshtien")))
true))
#_ (quick-bench (smoke-test-lev leven-slow))
;; Execution time mean : 14.2 sec
#_ (quick-bench (smoke-test-lev levenshtein))
;; Execution time mean : 1.25 ms
(defn lev2 [a b]
(let [f (mrfn f [i j]
(cond
(zero? i) j
(zero? j) i
:else (min
(inc (f (dec i) j))
(inc (f i (dec j)))
(+ (if (= (.charAt ^String a (dec i))
(.charAt ^String b (dec j)))
0
1)
(f (dec i) (dec j))))))]
(f (count a) (count b))))
(ns levenshtein.core
(:require
[clojure.string :as str]))
(defn without-last-char
"Returns the given word without the last char"
[x]
(cond
(str/blank? x) x
:else (subs x 0 (dec (count x)))))
(defn last-equal?
"Are the last chars of two words are equal?"
[a b]
(= (take-last 1 a) (take-last 1 b)))
(defn f
"Levenshtein distance between two words"
([[a b]] (f a b))
([a b]
(cond
(str/blank? a) (count b)
(str/blank? b) (count a)
:else (min (inc (f (without-last-char a) b))
(inc (f a (without-last-char b)))
(+ (f (without-last-char a) (without-last-char b))
(if (last-equal? a b) 0 1))))))
(def levenshtein-distance f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment