sordina/levenstein2.hs

## levenstein2.hs
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoMonomorphismRestriction #-}

import Data.HashMap.Lazy hiding (map)
import Data.Ord
import Data.Aeson
import Data.List (minimumBy, nub)
import Data.Text (pack)
import qualified Data.Text as T
import qualified Data.ByteString.Lazy.Char8 as LBS

-- Text Functions

type Text = T.Text

iT, sT, dT :: Int -> Char -> Text -> Text

iT x c t = T.concat [startT, midT, endT] where (startT, midT, endT) = (T.take x t, T.pack [c], T.drop x        t)
sT x c t = T.concat [startT, midT, endT] where (startT, midT, endT) = (T.take x t, T.pack [c], T.drop (succ x) t)
dT x _ t = T.concat [startT,       endT] where (startT,       endT) = (T.take x t,             T.drop (succ x) t)

prop_iT, prop_sT, prop_dT :: Bool
prop_iT = iT 1 'a' "hello" == "haello"
prop_sT = sT 1 'a' "hello" == "hallo"
prop_dT = dT 1 'x' "hello" == "hllo"

-- Data

data Cell = C { posi  :: Int
              , posj  :: Int
              , op    :: Op
              , char  :: Char
              , score :: Int } deriving (Eq, Show)

link :: Int -> Int -> Op -> Char -> Int -> [Cell] -> [Cell]
link i j o c s p = C i j o c s : p

scoreH :: [Cell] -> Int
scoreH p = score (head p)

-- Algo

data Op = Sub | Del | Ins | Nop deriving (Eq, Show)

levensteini :: Text -> Text -> [Cell]
levensteini a b = snd $ last $ levsl
  where
  subo      = (Sub, (-1, -1))
  delo      = (Del, ( 0, -1))
  inso      = (Ins, (-1,  0))
  offsets   = [subo, delo, inso]

  levsl     = [ ((i, j), lev i j) | j <- uptop a, i <- uptop b ]
  uptop x   = [0 .. T.length x]
  levsv     = fromList levsl
  ind   w i = T.index w (pred i)
  off   i j = \(di, dj) -> (i + di, j + dj)
  mini      = minimumBy (comparing (score . head . snd))

  lev   0 0 = [ C 0 0 Nop 'x' 0 ]
  lev   0 j = link 0 j Del 'x'       (1 + scoreH prec) prec where prec = (levsv ! (0, pred j))
  lev   i 0 = link i 0 Ins (ind b i) (1 + scoreH prec) prec where prec = (levsv ! (pred i, 0))
  lev   i j | (ind a j) == (ind b i) = leveq i j
            | otherwise              = levne i j

  leveq i j = link i j Nop 'x'       (scoreH prec)     prec where prec = levsv ! off i j (snd subo)
  levne i j = link i j o   (ind b i) (1 + scoreH prec) prec where (o, prec) = mini [ (o', (levsv ! off i j d)) | (o', d) <- offsets]

levenstein :: Text -> Text -> Int
levenstein a b = score $ head $ levensteini a b

-- Start at negative one, since the first operation is always a NOP to introduce the word
levenfold :: Text -> [Cell] -> [(Int, Text)]
levenfold a cs = scanl reduce (-1, a) cs

reduce :: (Int, Text) -> Cell -> (Int, Text)
reduce (x, s) c | op c == Ins = (succ x, iT x (char c) s)
                | op c == Sub = (succ x, sT x (char c) s)
                | op c == Del = (x     , dT x (char c) s)
                | otherwise   = (succ x, s)

levenwords :: Text -> Text -> [Text]
levenwords a b = nub $ map snd $ levenfold a (reverse (levensteini a b))


phrases :: [ Text ]
phrases = [ "We do data-science"
          , "We do consulting"
          , "We do design"
          , "We do user-experience"
          , "We do deep-learning"
          , "We do products"
          , "We do product-development"
          , "We do software-engineering"
          , "We do software-development"
          , "We do software-science"
          ]

pairs :: [(Text, Text)]
pairs = zip phrases (tail phrases) ++ [(last phrases, head phrases)]


main :: IO ()
main = LBS.putStrLn $ encode $ map (uncurry levenwords) pairs

-- Properties

prop_cannon_0, prop_cannon_1, prop_cannon_2, prop_cannon_s :: Bool
prop_levenstein_start, prop_levenstein_end, prop_levenstein_same :: String -> Bool
prop_levenstein_sym,   prop_levenstein_max, prop_start_state, prop_end_state, prop_steps :: String -> String -> Bool

prop_cannon_s           = levenstein "a"        "a"       == 0
prop_cannon_0           = levenstein "abcde"    "abcde"   == 0
prop_cannon_1           = levenstein "kitten"   "sitting" == 3
prop_cannon_2           = levenstein "Saturday" "Sunday"  == 3
prop_levenstein_start s = levenstein ""         t         == T.length t                      where t = pack s
prop_levenstein_end   s = levenstein t          ""        == T.length t                      where t = pack s
prop_levenstein_same  s = levenstein t          t         == 0                               where t = pack s
prop_levenstein_sym a b = levenstein ta         tb        == levenstein tb ta                where (ta, tb) = (pack (take 50 a), pack (take 50 b))
prop_levenstein_max a b = levenstein ta         tb        <= max (T.length ta) (T.length tb) where (ta, tb) = (pack (take 50 a), pack (take 50 b))
prop_start_state    a b = ta == head (levenwords ta tb) where (ta, tb) = (pack ("x" ++ a), pack ("x" ++ b))
prop_end_state      a b = tb == last (levenwords ta tb) where (ta, tb) = (pack ("x" ++ a), pack ("x" ++ b))
prop_steps          a b = length (levensteini ta tb) == 1 + max (T.length ta) (T.length tb) where (ta, tb) = (pack a, pack b)
	{-# LANGUAGE OverloadedStrings #-}
	{-# LANGUAGE NoMonomorphismRestriction #-}

	import Data.HashMap.Lazy hiding (map)
	import Data.Ord
	import Data.Aeson
	import Data.List (minimumBy, nub)
	import Data.Text (pack)
	import qualified Data.Text as T
	import qualified Data.ByteString.Lazy.Char8 as LBS

	-- Text Functions

	type Text = T.Text

	iT, sT, dT :: Int -> Char -> Text -> Text

	iT x c t = T.concat [startT, midT, endT] where (startT, midT, endT) = (T.take x t, T.pack [c], T.drop x t)
	sT x c t = T.concat [startT, midT, endT] where (startT, midT, endT) = (T.take x t, T.pack [c], T.drop (succ x) t)
	dT x _ t = T.concat [startT, endT] where (startT, endT) = (T.take x t, T.drop (succ x) t)

	prop_iT, prop_sT, prop_dT :: Bool
	prop_iT = iT 1 'a' "hello" == "haello"
	prop_sT = sT 1 'a' "hello" == "hallo"
	prop_dT = dT 1 'x' "hello" == "hllo"

	-- Data

	data Cell = C { posi :: Int
	, posj :: Int
	, op :: Op
	, char :: Char
	, score :: Int } deriving (Eq, Show)

	link :: Int -> Int -> Op -> Char -> Int -> [Cell] -> [Cell]
	link i j o c s p = C i j o c s : p

	scoreH :: [Cell] -> Int
	scoreH p = score (head p)

	-- Algo

	data Op = Sub \| Del \| Ins \| Nop deriving (Eq, Show)

	levensteini :: Text -> Text -> [Cell]
	levensteini a b = snd $ last $ levsl
	where
	subo = (Sub, (-1, -1))
	delo = (Del, ( 0, -1))
	inso = (Ins, (-1, 0))
	offsets = [subo, delo, inso]

	levsl = [ ((i, j), lev i j) \| j <- uptop a, i <- uptop b ]
	uptop x = [0 .. T.length x]
	levsv = fromList levsl
	ind w i = T.index w (pred i)
	off i j = \(di, dj) -> (i + di, j + dj)
	mini = minimumBy (comparing (score . head . snd))

	lev 0 0 = [ C 0 0 Nop 'x' 0 ]
	lev 0 j = link 0 j Del 'x' (1 + scoreH prec) prec where prec = (levsv ! (0, pred j))
	lev i 0 = link i 0 Ins (ind b i) (1 + scoreH prec) prec where prec = (levsv ! (pred i, 0))
	lev i j \| (ind a j) == (ind b i) = leveq i j
	\| otherwise = levne i j

	leveq i j = link i j Nop 'x' (scoreH prec) prec where prec = levsv ! off i j (snd subo)
	levne i j = link i j o (ind b i) (1 + scoreH prec) prec where (o, prec) = mini [ (o', (levsv ! off i j d)) \| (o', d) <- offsets]

	levenstein :: Text -> Text -> Int
	levenstein a b = score $ head $ levensteini a b

	-- Start at negative one, since the first operation is always a NOP to introduce the word
	levenfold :: Text -> [Cell] -> [(Int, Text)]
	levenfold a cs = scanl reduce (-1, a) cs

	reduce :: (Int, Text) -> Cell -> (Int, Text)
	reduce (x, s) c \| op c == Ins = (succ x, iT x (char c) s)
	\| op c == Sub = (succ x, sT x (char c) s)
	\| op c == Del = (x , dT x (char c) s)
	\| otherwise = (succ x, s)

	levenwords :: Text -> Text -> [Text]
	levenwords a b = nub $ map snd $ levenfold a (reverse (levensteini a b))


	phrases :: [ Text ]
	phrases = [ "We do data-science"
	, "We do consulting"
	, "We do design"
	, "We do user-experience"
	, "We do deep-learning"
	, "We do products"
	, "We do product-development"
	, "We do software-engineering"
	, "We do software-development"
	, "We do software-science"
	]

	pairs :: [(Text, Text)]
	pairs = zip phrases (tail phrases) ++ [(last phrases, head phrases)]


	main :: IO ()
	main = LBS.putStrLn $ encode $ map (uncurry levenwords) pairs

	-- Properties

	prop_cannon_0, prop_cannon_1, prop_cannon_2, prop_cannon_s :: Bool
	prop_levenstein_start, prop_levenstein_end, prop_levenstein_same :: String -> Bool
	prop_levenstein_sym, prop_levenstein_max, prop_start_state, prop_end_state, prop_steps :: String -> String -> Bool

	prop_cannon_s = levenstein "a" "a" == 0
	prop_cannon_0 = levenstein "abcde" "abcde" == 0
	prop_cannon_1 = levenstein "kitten" "sitting" == 3
	prop_cannon_2 = levenstein "Saturday" "Sunday" == 3
	prop_levenstein_start s = levenstein "" t == T.length t where t = pack s
	prop_levenstein_end s = levenstein t "" == T.length t where t = pack s
	prop_levenstein_same s = levenstein t t == 0 where t = pack s
	prop_levenstein_sym a b = levenstein ta tb == levenstein tb ta where (ta, tb) = (pack (take 50 a), pack (take 50 b))
	prop_levenstein_max a b = levenstein ta tb <= max (T.length ta) (T.length tb) where (ta, tb) = (pack (take 50 a), pack (take 50 b))
	prop_start_state a b = ta == head (levenwords ta tb) where (ta, tb) = (pack ("x" ++ a), pack ("x" ++ b))
	prop_end_state a b = tb == last (levenwords ta tb) where (ta, tb) = (pack ("x" ++ a), pack ("x" ++ b))
	prop_steps a b = length (levensteini ta tb) == 1 + max (T.length ta) (T.length tb) where (ta, tb) = (pack a, pack b)