Skip to content

Instantly share code, notes, and snippets.

/Diff.hs Secret

Created June 14, 2013 22:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/fa94652fa8a9aea8f68d to your computer and use it in GitHub Desktop.
Save anonymous/fa94652fa8a9aea8f68d to your computer and use it in GitHub Desktop.
Diff.hs - compile with -O2 -fllvm miller.c and test.c: Makefile attached
{-# LANGUAGE BangPatterns #-}
-- |
-- Implementation of "An O(NP) Sequence Comparison Algorithm" by Sun Wu, Udi
-- Manber, Gene Myers and Web Miller
-- Note: P=(D-(M-N))/2 where D is shortest edit distance, M,N sequence lengths,
-- M >= N
--
import Data.Vector.Unboxed as U hiding (mapM_)
import Data.Vector.Unboxed.Mutable as MU
import Control.Monad.ST as ST
import Control.Monad.Primitive (PrimState)
import Control.Monad as CM (when,forM_)
import Data.Int
import Data.Array.ST
type MVI1 s = MVector (PrimState (ST s)) Int
cmp :: Vector Int32 -> U.Vector Int32 -> Int -> Int -> Int
cmp a b i j = go 0 i j
where
n = U.length a
m = U.length b
go len i' j'| (i'<n) && (j'<m) && (unsafeIndex a i') == (unsafeIndex b j') = go (len+1) (i'+1) (j'+1)
| otherwise = len
{-#INLINE cmp #-}
-- function to find previous y on diagonal k for furthest point
findYP :: MVI1 s -> Int -> Int -> ST s Int
findYP fp k offset = do
y0 <- MU.unsafeRead fp (k+offset-1) >>= \x -> return $ 1+x
y1 <- MU.unsafeRead fp (k+offset+1)
if y0 > y1 then return y0
else return y1
{-#INLINE findYP #-}
-- Helper function for lcs
lcsh :: Vector Int32 -> Vector Int32 -> Int
lcsh a b = runST $ do
let n = U.length a
m = U.length b
fp <- MU.replicate (m+n+3) (-1) -- array of furthest points
let delta = m-n
offset=n+1
deloff=m+1 -- index location of diagonal at delta in "furthest point" array
findSnakes k ct op = go 0
where
go x
| x < ct = do
let k' = op k x
yp <- findYP fp k' offset
MU.unsafeWrite fp (k'+offset) (yp + (cmp a b (yp-k') yp))
go (x+1)
| otherwise = return ()
{-#INLINE findSnakes #-}
loop i = do
fpd <- MU.unsafeRead fp deloff
if fpd < m then
findSnakes (-i) (delta+i) (+) >>
findSnakes (delta+i) i (-) >>
findSnakes delta 1 (-) >>
loop (i+1)
else return i
{-#INLINE loop #-}
p <- loop 0
return $ n-(p-1)
{-#INLINABLE lcsh #-}
-- Function to find longest common subsequence given unboxed vectors a and b
-- It returns indices of LCS in a and b
lcs :: Vector Int32 -> Vector Int32 -> Int
lcs a b | (U.length a > U.length b) = lcsh b a
| otherwise = lcsh a b
{-#INLINABLE lcs #-}
main = print $ lcs (U.fromList [0..3999]) (U.fromList [3999..7999])
all: install
install:
gcc -O2 miller.c -std=c99 -Wall -c
gcc test.c miller.o -std=c99 -Wall -o test
clean:
rm -rf *.o test
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "miller.h"
//Thread-safe - no global, or local static variables
void vecfree(vec v){ free(v.vec); v.vec=NULL; v.size=0; return;}
size_t int32cmp(const vec a,const vec b,size_t i,size_t j){
size_t i1=i;
int32_t const* a1 = (int32_t const*)a.vec;
int32_t const* b1 = (int32_t const*)b.vec;
for(;(i<a.size) && (j<b.size) && a1[i] == b1[j];i++,j++);
return i-i1;
}
static inline int64_t __attribute__((always_inline)) incr(int64_t k){
return k+1;
}
static inline int64_t __attribute__((always_inline)) decr(int64_t k){
return k-1;
}
//Note: fp, snodes are constant of size m+n+3 - they represent furthest point, snake nodes
static inline void __attribute__((always_inline)) findsnakes(vec a,vec b,int64_t* fp,int64_t k,size_t (*cmp)(vec,vec,size_t,size_t),size_t ct, int64_t (*op)(int64_t)){
int64_t offset = (int64_t) a.size+1;
size_t kp;
int64_t xp,yp,x,y;
for(;0<ct;ct--){
if(fp[k+offset-1] + 1 > fp[k+offset+1]){
kp = k+offset-1;
yp = fp[kp]+1;
}
else{
kp = k+offset+1;
yp = fp[kp];
}
y=yp;x=y-k;xp=x;
x += cmp(a,b,x,y);
y += x-xp; //x-xp=cmp(a,b,x,y)
fp[k+offset] = y;
k=op(k);
}
}
static size_t lcsh(vec a,vec b,size_t (*cmp)(vec,vec,size_t,size_t),bool flip){
size_t n = a.size, m = b.size, delta = m-n, offset = n+1, p=0;
int64_t _m = (int64_t) m;
int64_t* fp = (int64_t*) malloc((m+n+3)*sizeof(int64_t));
for(int i=0;i<m+n+3;i++)fp[i]=-1;
//note since fp is initialized to -1 and m>0, the loop will execute
//at least once. So, p value is at least 0 since it is decremented
//by 1 after exiting the loop
for(;fp[delta+offset] < _m;p++){
findsnakes(a,b,fp,-1*p,cmp,delta+p,incr);
findsnakes(a,b,fp,delta+p,cmp,p,decr);
findsnakes(a,b,fp,delta,cmp,1,decr);
}
p-=1;
return n-p;
}
size_t lcs(vec a,vec b, size_t (*cmp)(vec,vec,size_t,size_t)){
bool flip = a.size > b.size ? 1:0;
size_t res;
if(flip) res=lcsh(b,a,cmp,1);
else res=lcsh(a,b,cmp,0);
return res;
}
#include <inttypes.h>
typedef enum {false,true} bool;
typedef struct{
int64_t p;
int64_t x;
int64_t y;
size_t len;
} snakes;
typedef struct{
size_t size;
void* vec;
}vec;
void vecfree(vec);
size_t int32cmp(vec,vec,size_t,size_t);
size_t lcs(vec,vec,size_t (*)(vec,vec,size_t,size_t));
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "miller.h"
vec* input;
void initc(int size){
input=malloc(2*sizeof(vec));
size_t size1=size;
size_t size2=size1 + 1;
int32_t* i1 = (int32_t*) malloc(size1*sizeof(int32_t));
int32_t* i2 = (int32_t*) malloc(size2*sizeof(int32_t));
for(int i=0;i<size1;i++) i1[i]=(int32_t)i;
for(int i=0;i<size2;i++) i2[i]=(int32_t)i+size-1;
input[0].size=size1;
input[0].vec = i1;
input[1].size=size2;
input[1].vec = i2;
}
int main(){
int res;
initc(4000); //generate inputs a ([0..3999]) and b ([3999..7999])
res=lcs(input[0],input[1],int32cmp); //call lcs - length should be 1
printf("%d\n",res);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment