Skip to content

Instantly share code, notes, and snippets.

@sordina
Last active June 2, 2016 11:11
Show Gist options
  • Save sordina/c8d04a0feb1322bd48d553c98f7606f5 to your computer and use it in GitHub Desktop.
Save sordina/c8d04a0feb1322bd48d553c98f7606f5 to your computer and use it in GitHub Desktop.
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoMonomorphismRestriction #-}
import Data.HashMap.Lazy hiding (map)
import Data.Ord
import Data.Aeson
import Data.List (minimumBy, nub)
import Data.Text (pack)
import qualified Data.Text as T
import qualified Data.ByteString.Lazy.Char8 as LBS
-- Text Functions
type Text = T.Text
iT, sT, dT :: Int -> Char -> Text -> Text
iT x c t = T.concat [startT, midT, endT] where (startT, midT, endT) = (T.take x t, T.pack [c], T.drop x t)
sT x c t = T.concat [startT, midT, endT] where (startT, midT, endT) = (T.take x t, T.pack [c], T.drop (succ x) t)
dT x _ t = T.concat [startT, endT] where (startT, endT) = (T.take x t, T.drop (succ x) t)
prop_iT, prop_sT, prop_dT :: Bool
prop_iT = iT 1 'a' "hello" == "haello"
prop_sT = sT 1 'a' "hello" == "hallo"
prop_dT = dT 1 'x' "hello" == "hllo"
-- Data
data Cell = C { posi :: Int
, posj :: Int
, op :: Op
, char :: Char
, score :: Int } deriving (Eq, Show)
link :: Int -> Int -> Op -> Char -> Int -> [Cell] -> [Cell]
link i j o c s p = C i j o c s : p
scoreH :: [Cell] -> Int
scoreH p = score (head p)
-- Algo
data Op = Sub | Del | Ins | Nop deriving (Eq, Show)
levensteini :: Text -> Text -> [Cell]
levensteini a b = snd $ last $ levsl
where
subo = (Sub, (-1, -1))
delo = (Del, ( 0, -1))
inso = (Ins, (-1, 0))
offsets = [subo, delo, inso]
levsl = [ ((i, j), lev i j) | j <- uptop a, i <- uptop b ]
uptop x = [0 .. T.length x]
levsv = fromList levsl
ind w i = T.index w (pred i)
off i j = \(di, dj) -> (i + di, j + dj)
mini = minimumBy (comparing (score . head . snd))
lev 0 0 = [ C 0 0 Nop 'x' 0 ]
lev 0 j = link 0 j Del 'x' (1 + scoreH prec) prec where prec = (levsv ! (0, pred j))
lev i 0 = link i 0 Ins (ind b i) (1 + scoreH prec) prec where prec = (levsv ! (pred i, 0))
lev i j | (ind a j) == (ind b i) = leveq i j
| otherwise = levne i j
leveq i j = link i j Nop 'x' (scoreH prec) prec where prec = levsv ! off i j (snd subo)
levne i j = link i j o (ind b i) (1 + scoreH prec) prec where (o, prec) = mini [ (o', (levsv ! off i j d)) | (o', d) <- offsets]
levenstein :: Text -> Text -> Int
levenstein a b = score $ head $ levensteini a b
-- Start at negative one, since the first operation is always a NOP to introduce the word
levenfold :: Text -> [Cell] -> [(Int, Text)]
levenfold a cs = scanl reduce (-1, a) cs
reduce :: (Int, Text) -> Cell -> (Int, Text)
reduce (x, s) c | op c == Ins = (succ x, iT x (char c) s)
| op c == Sub = (succ x, sT x (char c) s)
| op c == Del = (x , dT x (char c) s)
| otherwise = (succ x, s)
levenwords :: Text -> Text -> [Text]
levenwords a b = nub $ map snd $ levenfold a (reverse (levensteini a b))
phrases :: [ Text ]
phrases = [ "We do data-science"
, "We do consulting"
, "We do design"
, "We do user-experience"
, "We do deep-learning"
, "We do products"
, "We do product-development"
, "We do software-engineering"
, "We do software-development"
, "We do software-science"
]
pairs :: [(Text, Text)]
pairs = zip phrases (tail phrases) ++ [(last phrases, head phrases)]
main :: IO ()
main = LBS.putStrLn $ encode $ map (uncurry levenwords) pairs
-- Properties
prop_cannon_0, prop_cannon_1, prop_cannon_2, prop_cannon_s :: Bool
prop_levenstein_start, prop_levenstein_end, prop_levenstein_same :: String -> Bool
prop_levenstein_sym, prop_levenstein_max, prop_start_state, prop_end_state, prop_steps :: String -> String -> Bool
prop_cannon_s = levenstein "a" "a" == 0
prop_cannon_0 = levenstein "abcde" "abcde" == 0
prop_cannon_1 = levenstein "kitten" "sitting" == 3
prop_cannon_2 = levenstein "Saturday" "Sunday" == 3
prop_levenstein_start s = levenstein "" t == T.length t where t = pack s
prop_levenstein_end s = levenstein t "" == T.length t where t = pack s
prop_levenstein_same s = levenstein t t == 0 where t = pack s
prop_levenstein_sym a b = levenstein ta tb == levenstein tb ta where (ta, tb) = (pack (take 50 a), pack (take 50 b))
prop_levenstein_max a b = levenstein ta tb <= max (T.length ta) (T.length tb) where (ta, tb) = (pack (take 50 a), pack (take 50 b))
prop_start_state a b = ta == head (levenwords ta tb) where (ta, tb) = (pack ("x" ++ a), pack ("x" ++ b))
prop_end_state a b = tb == last (levenwords ta tb) where (ta, tb) = (pack ("x" ++ a), pack ("x" ++ b))
prop_steps a b = length (levensteini ta tb) == 1 + max (T.length ta) (T.length tb) where (ta, tb) = (pack a, pack b)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment