phadej/Levenshtein.hs

## Levenshtein.hs
module Main where

import Control.Monad
import Control.Monad.ST
import Data.Array.ST
import Data.Array.Unboxed

import Criterion.Main (defaultMain, bgroup, bench, whnf)

import Debug.Trace

simple :: String -> String -> Int
simple [] b             = length b
simple a []             = length a
{- Same from definition
simple a b | min (length a) (length b) == 0
                        = max (length a) (length b)
-}
-- both are not null
simple a@(x:xs) b@(y:ys) = minimum [ simple xs b + 1
                                   , simple a ys + 1
                                   , simple xs ys + if x == y then 0 else 1
                                   ]

-- Polynomial, using lazyness
polynomial :: String -> String -> Int
polynomial s t
  | s == t      = 0
polynomial s [] = length s
polynomial [] t = length t
polynomial s t  = a !! length s !! length t
  where a     = [[f i j | j <- [0..length t]] |  i <- [0..length s]]
        f 0 j = j
        f i 0 = i
        f i j = minimum [ (a !! i !! (j - 1)) + 1
                        , (a !! (i - 1) !! j) + 1
                        , (a !! (i - 1) !! (j - 1)) + if (s !! (i - 1)) == (t !! (j - 1)) then 0 else 1
                        ]

-- Loosely following the iterative C version
imperative :: String -> String -> Int
imperative a b | a == b = 0
imperative [] b         = length b
imperative a []         = length a
imperative a b          = runST $ do
  -- create two work vectors of integer distances
  v0 <- newArray_ (0, length b) :: ST s (STUArray s Int Int)
  v1 <- newArray_ (0, length b) :: ST s (STUArray s Int Int)

  -- initialize v0 (the previous row of distances)
  -- this row is A[0][i]: edit distance for an empty s
  -- the distance is just the number of characters to delete from t
  forM_ (range (0, length b)) $ \i -> do
    writeArray v0 i i

  forM_ (range (0, length a - 1)) $ \i -> do
    -- calculate v1 (current row distances) from the previous row v0

    {- Debug
    list <- getElems v0
    traceShow list (return ())
    --}

    -- first element of v1 is A[i+1][0]
    --   edit distance is delete (i+1) chars from s to match empty t
    writeArray v1 0 (i + 1)

    -- use formula to fill in the rest of the row
    forM_ (range (0, length b - 1)) $ \j -> do
      v1j <- readArray v1 j
      v0j <- readArray v0 j
      v0j1 <- readArray v0 $ j + 1
      writeArray v1 (j + 1) $ minimum [v1j + 1, v0j1 + 1, v0j + if (aArray ! i == bArray ! j) then 0 else 1]
    -- copy v1 (current row) to v0 (previous row) for next interation
    forM_ (range (0, length b)) $ \j -> do
      t <- readArray v1 j
      writeArray v0 j t
  -- return
  readArray v1 (length b)
    where aArray = listArray (0, length a - 1) a :: UArray Int Char
          bArray = listArray (0, length b - 1) b :: UArray Int Char

longword :: Int -> String
longword n = take n $ concat $ repeat "qwerty"

test :: String -> String -> Bool
test a b = s == p && s == i
  where s = simple a b
        p = polynomial a b
        i = imperative a b

main :: IO ()
main = defaultMain [
    bgroup "levenshtein" [
      bench "simple" $ whnf (simple a2) b2
    , bench "polynomial" $ whnf (polynomial a2) b2
    ],
    bgroup "polynomials" [
      bench "polynomial" $ whnf (polynomial a10) b10
    , bench "imperative" $ whnf (imperative a10) b10
    ]
  ]
  where a2 = longword2 ++ "kitten" ++ longword2
        b2 = longword2 ++ "sitting" ++ longword2
        a10 = longword10 ++ "kitten" ++ longword10
        b10 = longword10 ++ "sitting" ++ longword10
        longword2 = longword 2
        longword10 = longword 10

## results.txt
warming up
estimating clock resolution...
mean is 3.218045 us (160001 iterations)
found 1039 outliers among 159999 samples (0.6%)
  748 (0.5%) high severe
estimating cost of a clock call...
mean is 94.39777 ns (22 iterations)
found 2 outliers among 22 samples (9.1%)
  1 (4.5%) high mild
  1 (4.5%) high severe

benchmarking levenshtein/simple
collecting 100 samples, 1 iterations each, in estimated 61.04770 s
mean: 632.8908 ms, lb 619.3514 ms, ub 649.5024 ms, ci 0.950
std dev: 76.98222 ms, lb 65.24029 ms, ub 95.02371 ms, ci 0.950
found 3 outliers among 100 samples (3.0%)
  3 (3.0%) high mild
variance introduced by outliers: 85.219%
variance is severely inflated by outliers

benchmarking levenshtein/polynomial
mean: 29.04588 us, lb 28.44555 us, ub 29.80340 us, ci 0.950
std dev: 3.446563 us, lb 2.864958 us, ub 4.431689 us, ci 0.950
found 4 outliers among 100 samples (4.0%)
  3 (3.0%) high mild
  1 (1.0%) high severe
variance introduced by outliers: 84.198%
variance is severely inflated by outliers

benchmarking polynomials/polynomial
mean: 500.2735 us, lb 482.4497 us, ub 530.7803 us, ci 0.950
std dev: 117.2728 us, lb 78.34104 us, ub 186.7606 us, ci 0.950
found 6 outliers among 100 samples (6.0%)
  2 (2.0%) high mild
  4 (4.0%) high severe
variance introduced by outliers: 95.738%
variance is severely inflated by outliers

benchmarking polynomials/imperative
mean: 58.20797 us, lb 56.37413 us, ub 60.21839 us, ci 0.950
std dev: 9.846183 us, lb 8.873440 us, ub 11.31322 us, ci 0.950
variance introduced by outliers: 91.538%
variance is severely inflated by outliers
	module Main where

	import Control.Monad
	import Control.Monad.ST
	import Data.Array.ST
	import Data.Array.Unboxed

	import Criterion.Main (defaultMain, bgroup, bench, whnf)

	import Debug.Trace

	simple :: String -> String -> Int
	simple [] b = length b
	simple a [] = length a
	{- Same from definition
	simple a b \| min (length a) (length b) == 0
	= max (length a) (length b)
	-}
	-- both are not null
	simple a@(x:xs) b@(y:ys) = minimum [ simple xs b + 1
	, simple a ys + 1
	, simple xs ys + if x == y then 0 else 1
	]

	-- Polynomial, using lazyness
	polynomial :: String -> String -> Int
	polynomial s t
	\| s == t = 0
	polynomial s [] = length s
	polynomial [] t = length t
	polynomial s t = a !! length s !! length t
	where a = [[f i j \| j <- [0..length t]] \| i <- [0..length s]]
	f 0 j = j
	f i 0 = i
	f i j = minimum [ (a !! i !! (j - 1)) + 1
	, (a !! (i - 1) !! j) + 1
	, (a !! (i - 1) !! (j - 1)) + if (s !! (i - 1)) == (t !! (j - 1)) then 0 else 1
	]

	-- Loosely following the iterative C version
	imperative :: String -> String -> Int
	imperative a b \| a == b = 0
	imperative [] b = length b
	imperative a [] = length a
	imperative a b = runST $ do
	-- create two work vectors of integer distances
	v0 <- newArray_ (0, length b) :: ST s (STUArray s Int Int)
	v1 <- newArray_ (0, length b) :: ST s (STUArray s Int Int)

	-- initialize v0 (the previous row of distances)
	-- this row is A[0][i]: edit distance for an empty s
	-- the distance is just the number of characters to delete from t
	forM_ (range (0, length b)) $ \i -> do
	writeArray v0 i i

	forM_ (range (0, length a - 1)) $ \i -> do
	-- calculate v1 (current row distances) from the previous row v0

	{- Debug
	list <- getElems v0
	traceShow list (return ())
	--}

	-- first element of v1 is A[i+1][0]
	-- edit distance is delete (i+1) chars from s to match empty t
	writeArray v1 0 (i + 1)

	-- use formula to fill in the rest of the row
	forM_ (range (0, length b - 1)) $ \j -> do
	v1j <- readArray v1 j
	v0j <- readArray v0 j
	v0j1 <- readArray v0 $ j + 1
	writeArray v1 (j + 1) $ minimum [v1j + 1, v0j1 + 1, v0j + if (aArray ! i == bArray ! j) then 0 else 1]
	-- copy v1 (current row) to v0 (previous row) for next interation
	forM_ (range (0, length b)) $ \j -> do
	t <- readArray v1 j
	writeArray v0 j t
	-- return
	readArray v1 (length b)
	where aArray = listArray (0, length a - 1) a :: UArray Int Char
	bArray = listArray (0, length b - 1) b :: UArray Int Char

	longword :: Int -> String
	longword n = take n $ concat $ repeat "qwerty"

	test :: String -> String -> Bool
	test a b = s == p && s == i
	where s = simple a b
	p = polynomial a b
	i = imperative a b

	main :: IO ()
	main = defaultMain [
	bgroup "levenshtein" [
	bench "simple" $ whnf (simple a2) b2
	, bench "polynomial" $ whnf (polynomial a2) b2
	],
	bgroup "polynomials" [
	bench "polynomial" $ whnf (polynomial a10) b10
	, bench "imperative" $ whnf (imperative a10) b10
	]
	]
	where a2 = longword2 ++ "kitten" ++ longword2
	b2 = longword2 ++ "sitting" ++ longword2
	a10 = longword10 ++ "kitten" ++ longword10
	b10 = longword10 ++ "sitting" ++ longword10
	longword2 = longword 2
	longword10 = longword 10
	warming up
	estimating clock resolution...
	mean is 3.218045 us (160001 iterations)
	found 1039 outliers among 159999 samples (0.6%)
	748 (0.5%) high severe
	estimating cost of a clock call...
	mean is 94.39777 ns (22 iterations)
	found 2 outliers among 22 samples (9.1%)
	1 (4.5%) high mild
	1 (4.5%) high severe

	benchmarking levenshtein/simple
	collecting 100 samples, 1 iterations each, in estimated 61.04770 s
	mean: 632.8908 ms, lb 619.3514 ms, ub 649.5024 ms, ci 0.950
	std dev: 76.98222 ms, lb 65.24029 ms, ub 95.02371 ms, ci 0.950
	found 3 outliers among 100 samples (3.0%)
	3 (3.0%) high mild
	variance introduced by outliers: 85.219%
	variance is severely inflated by outliers

	benchmarking levenshtein/polynomial
	mean: 29.04588 us, lb 28.44555 us, ub 29.80340 us, ci 0.950
	std dev: 3.446563 us, lb 2.864958 us, ub 4.431689 us, ci 0.950
	found 4 outliers among 100 samples (4.0%)
	3 (3.0%) high mild
	1 (1.0%) high severe
	variance introduced by outliers: 84.198%
	variance is severely inflated by outliers

	benchmarking polynomials/polynomial
	mean: 500.2735 us, lb 482.4497 us, ub 530.7803 us, ci 0.950
	std dev: 117.2728 us, lb 78.34104 us, ub 186.7606 us, ci 0.950
	found 6 outliers among 100 samples (6.0%)
	2 (2.0%) high mild
	4 (4.0%) high severe
	variance introduced by outliers: 95.738%
	variance is severely inflated by outliers

	benchmarking polynomials/imperative
	mean: 58.20797 us, lb 56.37413 us, ub 60.21839 us, ci 0.950
	std dev: 9.846183 us, lb 8.873440 us, ub 11.31322 us, ci 0.950
	variance introduced by outliers: 91.538%
	variance is severely inflated by outliers