Skip to content

Instantly share code, notes, and snippets.

/Diff.hs Secret

Created June 6, 2013 18:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/c62f25466b407df73d7b to your computer and use it in GitHub Desktop.
Save anonymous/c62f25466b407df73d7b to your computer and use it in GitHub Desktop.
Diff.hs - compile with -O2 -fllvm miller.c and test.c: Makefile attached
{-# LANGUAGE BangPatterns #-}
-- |
-- Implementation of "An O(NP) Sequence Comparison Algorithm" by Sun Wu, Udi
-- Manber, Gene Myers and Web Miller
-- Note: P=(D-(M-N))/2 where D is shortest edit distance, M,N sequence lengths,
-- M >= N
--
import Data.Vector.Unboxed.Mutable as MU
import Data.Vector.Unboxed as U hiding (mapM_)
import Control.Monad.ST as ST
import Control.Monad.Primitive (PrimState)
import Control.Monad as CM (when,forM_)
import Data.STRef (newSTRef, modifySTRef, readSTRef, writeSTRef)
import Data.Int
type MVI1 s = MVector (PrimState (ST s)) Int
cmp :: U.Vector Int32 -> U.Vector Int32 -> Int -> Int -> Int
cmp a b i j = go 0 i j
where
n = U.length a
m = U.length b
go !len !i !j| (i<n) && (j<m) && ((unsafeIndex a i) == (unsafeIndex b j)) = go (len+1) (i+1) (j+1)
| otherwise = len
{-#INLINE cmp #-}
newVI1 :: Int -> Int -> ST s (MVI1 s)
newVI1 n x = do
a <- new n
mapM_ (\i -> MU.unsafeWrite a i x) [0..n-1]
return a
-- function to find previous y on diagonal k for furthest point
findYP :: MVI1 s -> Int -> Int -> ST s Int
findYP fp k offset = do
let k0 = k+offset-1
k1 = k+offset+1
y0 <- MU.unsafeRead fp k0 >>= \x -> return $ 1+x
y1 <- MU.unsafeRead fp k1
if y0 > y1 then return y0
else return y1
{-#INLINE findYP #-}
gridWalk :: Vector Int32 -> Vector Int32 -> MVI1 s -> Int -> ST s ()
gridWalk a b fp !k = {-#SCC gridWalk #-} do
let !offset = 1+U.length a
!yp <- {-#SCC findYP #-} findYP fp k offset
let xp = yp-k
!len = {-#SCC cmp #-} cmp a b xp yp
y = yp+len
{-#SCC "updateFP" #-} MU.unsafeWrite fp (k+offset) y
{-#INLINE gridWalk #-}
-- As noted in the paper, we find furthest point given diagonal k, and current set of furthest points.
-- The function below executes ct times, and updates furthest point (fp), snake node indices in snake
-- array (snodes), and inserts new snakes in snake array as they are found during furthest point search
findSnakes :: Vector Int32 -> Vector Int32 -> MVI1 s -> Int -> Int -> (Int -> Int -> Int) -> ST s ()
findSnakes a b fp !k !ct !op = {-#SCC findSnakes #-} CM.forM_ [0..ct-1] (\x -> gridWalk a b fp (op k x))
{-#INLINE findSnakes #-}
whileM_ :: ST s Bool -> ST s a -> ST s ()
whileM_ p f = go
where go = do
x <- p
if x
then f >> go
else return ()
-- Helper function for lcs
lcsh :: Vector Int32 -> Vector Int32 -> Int
lcsh a b = runST $ do
let n = U.length a
m = U.length b
delta = m-n
offset=n+1
deloff=m+1 -- index location of diagonal at delta in "furthest point" array
fp <- newVI1 (m+n+3) (-1) -- array of furthest points
p <- newSTRef 0 -- minimum number of deletions for a
whileM_ (MU.unsafeRead fp (deloff) >>= \x -> return $! (x<m))
$ do
n <- readSTRef p
findSnakes a b fp (-n) (delta+n) (+) -- vertical traversal
findSnakes a b fp (delta+n) n (-) -- horizontal traversal
findSnakes a b fp delta 1 (-) -- diagonal traversal
modifySTRef p (+1) -- increment p by 1
-- length of LCS is n-p. p must be decremented by 1 first for correct value of p
readSTRef p >>= \x -> return $ (U.length a)-(x-1)
{-#INLINABLE lcsh #-}
-- Function to find longest common subsequence given unboxed vectors a and b
-- It returns indices of LCS in a and b
lcs :: Vector Int32 -> Vector Int32 -> Int
lcs a b | (U.length a > U.length b) = lcsh b a
| otherwise = lcsh a b
{-#INLINABLE lcs #-}
main = print $ lcs (U.fromList [0..3999]) (U.fromList [3999..7999])
all: install
install:
gcc -O2 miller.c -std=c99 -Wall -c
gcc test.c miller.o -std=c99 -Wall -o test
clean:
rm -rf *.o test
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "miller.h"
//Thread-safe - no global, or local static variables
void vecfree(vec v){ free(v.vec); v.vec=NULL; v.size=0; return;}
size_t int32cmp(const vec a,const vec b,size_t i,size_t j){
size_t i1=i;
int32_t const* a1 = (int32_t const*)a.vec;
int32_t const* b1 = (int32_t const*)b.vec;
for(;(i<a.size) && (j<b.size) && a1[i] == b1[j];i++,j++);
return i-i1;
}
static inline int64_t __attribute__((always_inline)) incr(int64_t k){
return k+1;
}
static inline int64_t __attribute__((always_inline)) decr(int64_t k){
return k-1;
}
//Note: fp, snodes are constant of size m+n+3 - they represent furthest point, snake nodes
static inline void __attribute__((always_inline)) findsnakes(vec a,vec b,int64_t* fp,int64_t k,size_t (*cmp)(vec,vec,size_t,size_t),size_t ct, int64_t (*op)(int64_t)){
int64_t offset = (int64_t) a.size+1;
size_t kp;
int64_t xp,yp,x,y;
for(;0<ct;ct--){
if(fp[k+offset-1] + 1 > fp[k+offset+1]){
kp = k+offset-1;
yp = fp[kp]+1;
}
else{
kp = k+offset+1;
yp = fp[kp];
}
y=yp;x=y-k;xp=x;
x += cmp(a,b,x,y);
y += x-xp; //x-xp=cmp(a,b,x,y)
fp[k+offset] = y;
k=op(k);
}
}
static size_t lcsh(vec a,vec b,size_t (*cmp)(vec,vec,size_t,size_t),bool flip){
size_t n = a.size, m = b.size, delta = m-n, offset = n+1, p=0;
int64_t _m = (int64_t) m;
int64_t* fp = (int64_t*) malloc((m+n+3)*sizeof(int64_t));
for(int i=0;i<m+n+3;i++)fp[i]=-1;
//note since fp is initialized to -1 and m>0, the loop will execute
//at least once. So, p value is at least 0 since it is decremented
//by 1 after exiting the loop
for(;fp[delta+offset] < _m;p++){
findsnakes(a,b,fp,-1*p,cmp,delta+p,incr);
findsnakes(a,b,fp,delta+p,cmp,p,decr);
findsnakes(a,b,fp,delta,cmp,1,decr);
}
p-=1;
return n-p;
}
size_t lcs(vec a,vec b, size_t (*cmp)(vec,vec,size_t,size_t)){
bool flip = a.size > b.size ? 1:0;
size_t res;
if(flip) res=lcsh(b,a,cmp,1);
else res=lcsh(a,b,cmp,0);
return res;
}
#include <inttypes.h>
typedef enum {false,true} bool;
typedef struct{
int64_t p;
int64_t x;
int64_t y;
size_t len;
} snakes;
typedef struct{
size_t size;
void* vec;
}vec;
void vecfree(vec);
size_t int32cmp(vec,vec,size_t,size_t);
size_t lcs(vec,vec,size_t (*)(vec,vec,size_t,size_t));
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "miller.h"
vec* input;
void initc(int size){
input=malloc(2*sizeof(vec));
size_t size1=size;
size_t size2=size1 + 1;
int32_t* i1 = (int32_t*) malloc(size1*sizeof(int32_t));
int32_t* i2 = (int32_t*) malloc(size2*sizeof(int32_t));
for(int i=0;i<size1;i++) i1[i]=(int32_t)i;
for(int i=0;i<size2;i++) i2[i]=(int32_t)i+size-1;
input[0].size=size1;
input[0].vec = i1;
input[1].size=size2;
input[1].vec = i2;
}
int main(){
int res;
initc(4000); //generate inputs a ([0..3999]) and b ([3999..7999])
res=lcs(input[0],input[1],int32cmp); //call lcs - length should be 1
printf("%d\n",res);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment