Last active
July 10, 2018 15:16
-
-
Save regiskuckaertz/caf4839430de259f8e3f8605d58280ad to your computer and use it in GitHub Desktop.
Edit distance with the Wagner-Fischer algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Levenshtein where | |
pairs :: [a] -> [(a,a)] | |
pairs [x,y] = [(x,y)] | |
pairs (x:y:xs) = (x,y) : pairs (y:xs) | |
vowels :: String | |
vowels = "aeiou" | |
cost :: Rational -> Rational -> Rational -> Char -> Char -> Rational | |
cost a b c s t | s == t = b | |
| otherwise = (a + insert t) `min` (b + subst s t) `min` (c + delete s) | |
-- Cost of substitution | |
subst :: Char -> Char -> Rational | |
subst s t | s `elem` vowels = 0.5 | |
| t `elem` vowels = 0.5 | |
| otherwise = 1 | |
-- Cost of insertion/removal | |
insert, delete :: Char -> Rational | |
insert s | s `elem` vowels = 0.5 | |
| otherwise = 1 | |
delete = insert | |
-- Coefficients for row `i` of the matrix given coefficients for row `i-1` | |
costs :: [Rational] -> Rational -> Char -> String -> [Rational] | |
costs row i s ts = reverse . foldl f [i] $ xss | |
where f (a:as) ((b,c), t) = cost a b c s t : (a:as) | |
xss = zip (pairs row) ts | |
levenshtein :: String -> String -> Double | |
levenshtein "" ts = fromRational . sum . map insert $ ts | |
levenshtein ss "" = fromRational . sum . map insert $ ss | |
levenshtein ss ts = fromRational . last $ row | |
where row = foldl f row0 (reverse $ foldl h [(0,'-')] ss) | |
row0 = reverse $ foldl g [0] ts | |
f row (i,s) = costs row i s ts | |
g l@(x:xs) t = x + delete t : l | |
h l@((x,_):xs) s = (x + insert s, s) : l | |
-- The cost matrix of `levenshtein "Pneumonoultramicroscopicsilicovolcanoconiosis" "Pseudopseudohypoparathyroidism" | |
-- | |
-- P s e u d o p s e u d o h y p o p a r a t h y r o i d i s m | |
-- 0 1 2 2.5 3 4 4.5 5.5 6.5 7 7.5 8.5 9 10 11 12 12.5 13.5 14 15 15.5 16.5 17.5 18.5 19.5 20 20.5 21.5 22 23 24 | |
-- P 1 0 1 1.5 2 3 3.5 4.5 5.5 6 6.5 7.5 8 9 10 11 11.5 12.5 13 14 14.5 15.5 16.5 17.5 18.5 19 19.5 20.5 21 22 23 | |
-- n 2 1 1 1.5 2 3 3.5 4.5 5.5 6 6.5 7.5 8 9 10 11 11.5 12.5 13 14 14.5 15.5 16.5 17.5 18.5 19 19.5 20.5 21 22 23 | |
-- e 2.5 1.5 1.5 1 1.5 2.5 3 4 5 5.5 6 7 7.5 8.5 9.5 10.5 11 12 12.5 13.5 14 15 16 17 18 18.5 19 20 20.5 21.5 22.5 | |
-- u 3 2 2 1.5 1 2 2.5 3.5 4.5 5 5.5 6.5 7 8 9 10 10.5 11.5 12 13 13.5 14.5 15.5 16.5 17.5 18 18.5 19.5 20 21 22 | |
-- m 4 3 3 2.5 2 2 2.5 3.5 4.5 5 5.5 6.5 7 8 9 10 10.5 11.5 12 13 13.5 14.5 15.5 16.5 17.5 18 18.5 19.5 20 21 21 | |
-- o 4.5 3.5 3.5 3 2.5 2.5 2 3 4 4.5 5 6 6.5 7.5 8.5 9.5 10 11 11.5 12.5 13 14 15 16 17 17.5 18 19 19.5 20.5 21.5 | |
-- n 5.5 4.5 4.5 4 3.5 3.5 3 3 4 4.5 5 6 6.5 7.5 8.5 9.5 10 11 11.5 12.5 13 14 15 16 17 17.5 18 19 19.5 20.5 21.5 | |
-- o 6 5 5 4.5 4 4 3.5 3.5 3.5 4 4.5 5.5 6 7 8 9 9.5 10.5 11 12 12.5 13.5 14.5 15.5 16.5 17 17.5 18.5 19 20 21 | |
-- u 6.5 5.5 5.5 5 4.5 4.5 4 4 4 4 4 5 5.5 6.5 7.5 8.5 9 10 10.5 11.5 12 13 14 15 16 16.5 17 18 18.5 19.5 20.5 | |
-- l 7.5 6.5 6.5 6 5.5 5.5 5 5 5 4.5 4.5 5 5.5 6.5 7.5 8.5 9 10 10.5 11.5 12 13 14 15 16 16.5 17 18 18.5 19.5 20.5 | |
-- t 8.5 7.5 7.5 7 6.5 6.5 6 6 6 5.5 5 5.5 5.5 6.5 7.5 8.5 9 10 10.5 11.5 12 12 13 14 15 15.5 16 17 17.5 18.5 19.5 | |
-- r 9.5 8.5 8.5 8 7.5 7.5 7 7 7 6.5 6 6 6 6.5 7.5 8.5 9 10 10.5 10.5 11 12 13 14 14 14.5 15 16 16.5 17.5 18.5 | |
-- a 10 9 9 8.5 8 8 7.5 7.5 7.5 7 6.5 6.5 6.5 6.5 7 8 8.5 9.5 10 11 10.5 11.5 12.5 13.5 14.5 14.5 15 15.5 16 17 18 | |
-- m 11 10 10 9.5 9 9 8.5 8.5 8.5 8 7.5 7.5 7 7.5 7.5 8 8.5 9.5 10 11 11.5 11.5 12.5 13.5 14.5 15 15 16 16 17 17 | |
-- i 11.5 10.5 10.5 10 9.5 9.5 9 9 9 8.5 8 8 7.5 7.5 8 8 8.5 9 9.5 10.5 11 12 12 13 14 14.5 15 15.5 16 16.5 17.5 | |
-- c 12.5 11.5 11.5 11 10.5 10.5 10 10 10 9.5 9 9 8.5 8.5 8.5 9 8.5 9.5 9.5 10.5 11 12 13 13 14 14.5 15 16 16 17 17.5 | |
-- r 13.5 12.5 12.5 12 11.5 11.5 11 11 11 10.5 10 10 9.5 9.5 9.5 9.5 9.5 9.5 10 9.5 10 11 12 13 13 13.5 14 15 15.5 16.5 17.5 | |
-- o 14 13 13 12.5 12 12 11.5 11.5 11.5 11 10.5 10.5 10 10 10 10 9.5 10 10 10 10 10.5 11.5 12.5 13.5 13 13.5 14.5 15 16 17 | |
-- s 15 14 13 13.5 13 13 12.5 12.5 11.5 12 11.5 11.5 11 11 11 11 10.5 10.5 10.5 11 10.5 11 11.5 12.5 13.5 14 13.5 14.5 15 15 16 | |
-- c 16 15 14 13.5 14 14 13.5 13.5 12.5 12 12.5 12.5 12 12 12 12 11.5 11.5 11 11.5 11.5 11.5 12 12.5 13.5 14 14.5 14.5 15 16 16 | |
-- o 16.5 15.5 14.5 14 14 14.5 14 14 13 12.5 12.5 13 12.5 12.5 12.5 12.5 12 12 11.5 11.5 12 12 12 12.5 13 13.5 14 15 15 15.5 16.5 | |
-- p 17.5 16.5 15.5 15 14.5 15 15 14 14 13.5 13 13.5 13.5 13.5 13.5 12.5 13 12 12.5 12.5 12 13 13 13 13.5 13.5 14 15 15.5 16 16.5 | |
-- i 18 17 16 15.5 15 15 15.5 14.5 14.5 14 13.5 13.5 14 14 14 13 13 12.5 12.5 13 12.5 12.5 13.5 13.5 13.5 14 13.5 14.5 15 16 16.5 | |
-- c 19 18 17 16.5 16 16 15.5 15.5 15.5 15 14.5 14.5 14 15 15 14 13.5 13.5 13 13.5 13.5 13.5 13.5 14.5 14.5 14 14.5 14.5 15 16 17 | |
-- s 20 19 18 17.5 17 17 16.5 16.5 15.5 16 15.5 15.5 15 15 16 15 14.5 14.5 14 14 14 14.5 14.5 14.5 15.5 15 14.5 15.5 15 15 16 | |
-- i 20.5 19.5 18.5 18 17.5 17.5 17 17 16 16 16 16 15.5 15.5 15.5 15.5 15 15 14.5 14.5 14.5 14.5 15 15 15 15.5 15 15 15.5 15.5 15.5 | |
-- l 21.5 20.5 19.5 19 18.5 18.5 18 18 17 16.5 16.5 17 16.5 16.5 16.5 16.5 16 16 15.5 15.5 15 15.5 15.5 16 16 15.5 16 16 15.5 16.5 16.5 | |
-- i 22 21 20 19.5 19 19 18.5 18.5 17.5 17 17 17 17 17 17 17 16.5 16.5 16 16 15.5 15.5 16 16 16.5 16 15.5 16.5 16 16 17 | |
-- c 23 22 21 20.5 20 20 19.5 19.5 18.5 18 17.5 18 17.5 18 18 18 17.5 17.5 17 17 16.5 16.5 16.5 17 17 17 16.5 16.5 17 17 17 | |
-- o 23.5 22.5 21.5 21 20.5 20.5 20 20 19 18.5 18 18 18 18 18.5 18.5 18 18 17.5 17.5 17 17 17 17 17.5 17 17 17 17 17.5 17.5 | |
-- v 24.5 23.5 22.5 22 21.5 21.5 21 21 20 19.5 19 19 18.5 19 19 19.5 19 19 18.5 18.5 18 18 18 18 18 18 17.5 18 17.5 18 18.5 | |
-- o 25 24 23 22.5 22 22 21.5 21.5 20.5 20 19.5 19.5 19 19 19.5 19.5 19.5 19.5 19 19 18.5 18.5 18.5 18.5 18.5 18 18 18 18 18 18.5 | |
-- l 26 25 24 23.5 23 23 22.5 22.5 21.5 21 20.5 20.5 20 20 20 20.5 20 20.5 20 20 19.5 19.5 19.5 19.5 19.5 19 18.5 19 18.5 19 19 | |
-- c 27 26 25 24.5 24 24 23.5 23.5 22.5 22 21.5 21.5 21 21 21 21 21 21 21 21 20.5 20.5 20.5 20.5 20.5 20 19.5 19.5 19.5 19.5 20 | |
-- a 27.5 26.5 25.5 25 24.5 24.5 24 24 23 22.5 22 22 21.5 21.5 21.5 21.5 21.5 21.5 21 21.5 21 21 21 21 21 20.5 20 20 20 20 20 | |
-- n 28.5 27.5 26.5 26 25.5 25.5 25 25 24 23.5 23 23 22.5 22.5 22.5 22.5 22 22.5 22 22 22 22 22 22 22 21.5 21 21 20.5 21 21 | |
-- o 29 28 27 26.5 26 26 25.5 25.5 24.5 24 23.5 23.5 23 23 23 23 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22.5 22 21.5 21.5 21 21 21.5 | |
-- c 30 29 28 27.5 27 27 26.5 26.5 25.5 25 24.5 24.5 24 24 24 24 23.5 23.5 23 23.5 23 23.5 23.5 23.5 23.5 23 22.5 22.5 22 22 22 | |
-- o 30.5 29.5 28.5 28 27.5 27.5 27 27 26 25.5 25 25 24.5 24.5 24.5 24.5 24 24 23.5 23.5 23.5 23.5 24 24 24 23.5 23 23 22.5 22.5 22.5 | |
-- n 31.5 30.5 29.5 29 28.5 28.5 28 28 27 26.5 26 26 25.5 25.5 25.5 25.5 25 25 24.5 24.5 24 24.5 24.5 25 25 24.5 24 24 23.5 23.5 23.5 | |
-- i 32 31 30 29.5 29 29 28.5 28.5 27.5 27 26.5 26.5 26 26 26 26 25.5 25.5 25 25 24.5 24.5 25 25 25.5 25 24.5 24.5 24 24 24 | |
-- o 32.5 31.5 30.5 30 29.5 29.5 29 29 28 27.5 27 27 26.5 26.5 26.5 26.5 26 26 25.5 25.5 25 25 25 25.5 25.5 25.5 25 25 24.5 24.5 24.5 | |
-- s 33.5 32.5 31.5 31 30.5 30.5 30 30 29 28.5 28 28 27.5 27.5 27.5 27.5 27 27 26.5 26.5 26 26 26 26 26.5 26 26 26 25.5 24.5 25.5 | |
-- i 34 33 32 31.5 31 31 30.5 30.5 29.5 29 28.5 28.5 28 28 28 28 27.5 27.5 27 27 26.5 26.5 26.5 26.5 26.5 26.5 26 26.5 26 25 25 | |
-- s 35 34 33 32.5 32 32 31.5 31.5 30.5 30 29.5 29.5 29 29 29 29 28.5 28.5 28 28 27.5 27.5 27.5 27.5 27.5 27 27 27 27 26 26 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment