-
-
Save anonymous/91295c6236d14d7b4446 to your computer and use it in GitHub Desktop.
Compile: ghc Miller.hs -O2 -fllvm -prof -fprof-auto Run: ./Miller +RTS -s -p -hy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE BangPatterns #-} | |
-- | | |
-- Implementation of "An O(NP) Sequence Comparison Algorithm" by Sun Wu, Udi | |
-- Manber, Gene Myers and Web Miller | |
-- Note: P=(D-(M-N))/2 where D is shortest edit distance, M,N sequence lengths, | |
-- M >= N | |
-- | |
import Data.Vector.Unboxed.Mutable as MU | |
import Data.Vector.Unboxed as U hiding (mapM_) | |
import Control.Monad.ST as ST | |
import Control.Monad.Primitive (PrimState) | |
import Control.Monad (when) | |
import Data.STRef (newSTRef, modifySTRef, readSTRef) | |
import Data.Int | |
type MVI1 s = MVector (PrimState (ST s)) Int | |
cmp :: U.Vector Int32 -> U.Vector Int32 -> Int -> Int -> Int | |
cmp a b i j = go 0 i j | |
where | |
n = U.length a | |
m = U.length b | |
go !len !i !j| (i<n) && (j<m) && ((unsafeIndex a i) == (unsafeIndex b j)) = go (len+1) (i+1) (j+1) | |
| otherwise = len | |
{-#INLINE cmp #-} | |
newVI1 :: Int -> Int -> ST s (MVI1 s) | |
newVI1 n x = do | |
a <- new n | |
mapM_ (\i -> MU.unsafeWrite a i x) [0..n-1] | |
return a | |
-- function to find previous y on diagonal k for furthest point | |
findYP :: MVI1 s -> Int -> Int -> ST s (Int,Int) | |
findYP fp k offset = do | |
let k0 = k+offset-1 | |
k1 = k+offset+1 | |
y0 <- MU.unsafeRead fp k0 >>= \x -> return $ 1+x | |
y1 <- MU.unsafeRead fp k1 | |
if y0 > y1 then return (k0,y0) | |
else return (k1,y1) | |
{-#INLINE findYP #-} | |
gridWalk :: Vector Int32 -> Vector Int32 -> MVI1 s -> Int -> (Vector Int32 -> Vector Int32 -> Int -> Int -> Int) -> ST s () | |
gridWalk a b fp !k cmp = {-#SCC gridWalk #-} do | |
let !offset = 1+U.length a | |
(!kp,!yp) <- {-#SCC findYP #-} findYP fp k offset | |
let xp = yp-k | |
len = {-#SCC cmp #-} cmp a b xp yp | |
x = xp+len | |
y = yp+len | |
{-#SCC "updateFP" #-} MU.unsafeWrite fp (k+offset) y | |
return () | |
{-#INLINE gridWalk #-} | |
-- As noted in the paper, we find furthest point given diagonal k, and current set of furthest points. | |
-- The function below executes ct times, and updates furthest point (fp), snake node indices in snake | |
-- array (snodes), and inserts new snakes in snake array as they are found during furthest point search | |
findSnakes :: Vector Int32 -> Vector Int32 -> MVI1 s -> Int -> Int -> (Vector Int32 -> Vector Int32 -> Int -> Int -> Int) -> (Int -> Int -> Int) -> ST s () | |
findSnakes a b fp !k !ct cmp op = {-#SCC findSnakes #-} U.forM_ (U.fromList [0..ct-1]) (\x -> gridWalk a b fp (op k x) cmp) | |
{-#INLINE findSnakes #-} | |
-- while tail-recursive loop courtesy of Gabriel Gonzalez | |
-- see http://www.haskellforall.com/2012/01/haskell-for-c-programmers-for-loops.html | |
while :: (Monad m) => m Bool -> m a -> m () | |
while !cond !action = do | |
c <- cond | |
when c $ do | |
action | |
while cond action | |
-- Helper function for lcs | |
lcsh :: Vector Int32 -> Vector Int32 -> Bool -> Int | |
lcsh a b flip = runST $ do | |
let n = U.length a | |
m = U.length b | |
delta = m-n | |
offset=n+1 | |
deloff=m+1 -- array index location of diagonal at delta | |
fp <- newVI1 (m+n+3) (-1) -- array of furthest points | |
p <- newSTRef 0 -- minimum number of deletions for a | |
while (MU.unsafeRead fp (deloff) >>= \x -> return $! (x<m)) | |
$ do | |
n <- readSTRef p | |
findSnakes a b fp (-n) (delta+n) cmp (+) -- vertical traversal | |
findSnakes a b fp (delta+n) n cmp (-) -- horizontal traversal | |
findSnakes a b fp delta 1 cmp (-) -- diagonal traversal | |
modifySTRef p (+1) -- increment p by 1 | |
-- length of LCS is n-p. p must be decremented by 1 first for correct value of p | |
readSTRef p >>= \x -> return $ (U.length a)-(x-1) | |
{-#INLINABLE lcsh #-} | |
-- Function to find longest common subsequence given unboxed vectors a and b | |
-- It returns indices of LCS in a and b | |
lcs :: Vector Int32 -> Vector Int32 -> Int | |
lcs a b | (U.length a > U.length b) = lcsh b a True | |
| otherwise = lcsh a b False | |
{-#INLINABLE lcs #-} | |
main = print $ lcs (U.fromList [0..3999]) (U.fromList [3999..7999]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment