Last active
June 2, 2016 11:11
-
-
Save sordina/c8d04a0feb1322bd48d553c98f7606f5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE OverloadedStrings #-} | |
{-# LANGUAGE NoMonomorphismRestriction #-} | |
import Data.HashMap.Lazy hiding (map) | |
import Data.Ord | |
import Data.Aeson | |
import Data.List (minimumBy, nub) | |
import Data.Text (pack) | |
import qualified Data.Text as T | |
import qualified Data.ByteString.Lazy.Char8 as LBS | |
-- Text Functions | |
type Text = T.Text | |
iT, sT, dT :: Int -> Char -> Text -> Text | |
iT x c t = T.concat [startT, midT, endT] where (startT, midT, endT) = (T.take x t, T.pack [c], T.drop x t) | |
sT x c t = T.concat [startT, midT, endT] where (startT, midT, endT) = (T.take x t, T.pack [c], T.drop (succ x) t) | |
dT x _ t = T.concat [startT, endT] where (startT, endT) = (T.take x t, T.drop (succ x) t) | |
prop_iT, prop_sT, prop_dT :: Bool | |
prop_iT = iT 1 'a' "hello" == "haello" | |
prop_sT = sT 1 'a' "hello" == "hallo" | |
prop_dT = dT 1 'x' "hello" == "hllo" | |
-- Data | |
data Cell = C { posi :: Int | |
, posj :: Int | |
, op :: Op | |
, char :: Char | |
, score :: Int } deriving (Eq, Show) | |
link :: Int -> Int -> Op -> Char -> Int -> [Cell] -> [Cell] | |
link i j o c s p = C i j o c s : p | |
scoreH :: [Cell] -> Int | |
scoreH p = score (head p) | |
-- Algo | |
data Op = Sub | Del | Ins | Nop deriving (Eq, Show) | |
levensteini :: Text -> Text -> [Cell] | |
levensteini a b = snd $ last $ levsl | |
where | |
subo = (Sub, (-1, -1)) | |
delo = (Del, ( 0, -1)) | |
inso = (Ins, (-1, 0)) | |
offsets = [subo, delo, inso] | |
levsl = [ ((i, j), lev i j) | j <- uptop a, i <- uptop b ] | |
uptop x = [0 .. T.length x] | |
levsv = fromList levsl | |
ind w i = T.index w (pred i) | |
off i j = \(di, dj) -> (i + di, j + dj) | |
mini = minimumBy (comparing (score . head . snd)) | |
lev 0 0 = [ C 0 0 Nop 'x' 0 ] | |
lev 0 j = link 0 j Del 'x' (1 + scoreH prec) prec where prec = (levsv ! (0, pred j)) | |
lev i 0 = link i 0 Ins (ind b i) (1 + scoreH prec) prec where prec = (levsv ! (pred i, 0)) | |
lev i j | (ind a j) == (ind b i) = leveq i j | |
| otherwise = levne i j | |
leveq i j = link i j Nop 'x' (scoreH prec) prec where prec = levsv ! off i j (snd subo) | |
levne i j = link i j o (ind b i) (1 + scoreH prec) prec where (o, prec) = mini [ (o', (levsv ! off i j d)) | (o', d) <- offsets] | |
levenstein :: Text -> Text -> Int | |
levenstein a b = score $ head $ levensteini a b | |
-- Start at negative one, since the first operation is always a NOP to introduce the word | |
levenfold :: Text -> [Cell] -> [(Int, Text)] | |
levenfold a cs = scanl reduce (-1, a) cs | |
reduce :: (Int, Text) -> Cell -> (Int, Text) | |
reduce (x, s) c | op c == Ins = (succ x, iT x (char c) s) | |
| op c == Sub = (succ x, sT x (char c) s) | |
| op c == Del = (x , dT x (char c) s) | |
| otherwise = (succ x, s) | |
levenwords :: Text -> Text -> [Text] | |
levenwords a b = nub $ map snd $ levenfold a (reverse (levensteini a b)) | |
phrases :: [ Text ] | |
phrases = [ "We do data-science" | |
, "We do consulting" | |
, "We do design" | |
, "We do user-experience" | |
, "We do deep-learning" | |
, "We do products" | |
, "We do product-development" | |
, "We do software-engineering" | |
, "We do software-development" | |
, "We do software-science" | |
] | |
pairs :: [(Text, Text)] | |
pairs = zip phrases (tail phrases) ++ [(last phrases, head phrases)] | |
main :: IO () | |
main = LBS.putStrLn $ encode $ map (uncurry levenwords) pairs | |
-- Properties | |
prop_cannon_0, prop_cannon_1, prop_cannon_2, prop_cannon_s :: Bool | |
prop_levenstein_start, prop_levenstein_end, prop_levenstein_same :: String -> Bool | |
prop_levenstein_sym, prop_levenstein_max, prop_start_state, prop_end_state, prop_steps :: String -> String -> Bool | |
prop_cannon_s = levenstein "a" "a" == 0 | |
prop_cannon_0 = levenstein "abcde" "abcde" == 0 | |
prop_cannon_1 = levenstein "kitten" "sitting" == 3 | |
prop_cannon_2 = levenstein "Saturday" "Sunday" == 3 | |
prop_levenstein_start s = levenstein "" t == T.length t where t = pack s | |
prop_levenstein_end s = levenstein t "" == T.length t where t = pack s | |
prop_levenstein_same s = levenstein t t == 0 where t = pack s | |
prop_levenstein_sym a b = levenstein ta tb == levenstein tb ta where (ta, tb) = (pack (take 50 a), pack (take 50 b)) | |
prop_levenstein_max a b = levenstein ta tb <= max (T.length ta) (T.length tb) where (ta, tb) = (pack (take 50 a), pack (take 50 b)) | |
prop_start_state a b = ta == head (levenwords ta tb) where (ta, tb) = (pack ("x" ++ a), pack ("x" ++ b)) | |
prop_end_state a b = tb == last (levenwords ta tb) where (ta, tb) = (pack ("x" ++ a), pack ("x" ++ b)) | |
prop_steps a b = length (levensteini ta tb) == 1 + max (T.length ta) (T.length tb) where (ta, tb) = (pack a, pack b) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment