Skip to content

Instantly share code, notes, and snippets.

@ksf
Created March 30, 2010 15:25
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save ksf/349196 to your computer and use it in GitHub Desktop.
{-# OPTIONS_GHC -fvia-C -O2 -optc-O2 -optc-ffast-math -fexcess-precision -funbox-strict-fields #-}
module Data where
iubs, homs :: [(Char,Float)]
iubs = [('a',0.27),('c',0.12),('g',0.12),('t',0.27),('B',0.02)
,('D',0.02),('H',0.02),('K',0.02),('M',0.02),('N',0.02)
,('R',0.02),('S',0.02),('V',0.02),('W',0.02),('Y',0.02)]
homs = [('a',0.3029549426680),('c',0.1979883004921)
,('g',0.1975473066391),('t',0.3015094502008)]
alu = "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\
\GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\
\CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\
\ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\
\GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\
\AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\
\AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"
{-# OPTIONS_GHC -fvia-C -O2 -optc-O2 -optc-ffast-math -fexcess-precision -funbox-strict-fields #-}
{-# LANGUAGE TemplateHaskell, MagicHash, BangPatterns #-}
module Fasta where
import qualified Data.ByteString as S
import Data.ByteString.Internal
import Language.Haskell.TH
import GHC.Word
import GHC.Base
import Foreign
import System.IO
import Control.Arrow
import Debug.Trace
import Unsafe.Coerce
bufSize = 4096
printSeq dat = [|
let go seed n buf = {-# SCC "bufs" #-} bufs buf n seed
bufs buf 0 seed = return seed
bufs buf i seed
| seqT seed = do
seed' <- {-# SCC "lines" #-} lines buf m seed
hPutBuf stdout buf m'
bufs buf (i-m) seed'
where
m = min i ((bufSize `div` (lineL+1)) * lineL)
m' = (m+(m `quot` lineL)+if m `rem` lineL > 0 then 1 else 0)
lines buf 0 seed = return seed
lines buf i seed
| seqT seed = {-# SCC "bytes" #-} bytes eol buf seed >>= lines (incPtr eol) (i-m)
where
m = min i lineL
eol = buf `plusPtr` m
bytes eol rp seed
| rp == eol = pokeNl eol >> return seed
| seqT seed = poke rp w >> bytes eol (incPtr rp) seed'
where (w, seed') = {-# SCC "choose" #-} choose seed
lineL = 60
seqT = flip seq True
incPtr = flip plusPtr 1
pokeNl ptr = poke ptr (c2w '\n')
choose = first (\p -> ($(lookupNuc dat) p)) . rand
in \n seed -> allocaBytes bufSize (go seed n) |]
writeFasta label title n s = do
putStrLn $ ">" ++ label ++ " " ++ title
let (t:ts) = cycle . return . S.pack . map c2w $ s
go ts t n
where
go ss s n
| l60 && n60 = S.putStrLn l >> go ss r (n-60)
| n60 = S.putStr s >> S.putStrLn a >> go (tail ss) b (n-60)
| n <= ln = S.putStrLn (S.take n s)
| otherwise = S.putStr s >> S.putStrLn (S.take (n-ln) (head ss))
where
ln = S.length s
l60 = ln >= 60
n60 = n >= 60
(l,r) = S.splitAt 60 s
(a,b) = S.splitAt (60-ln) (head ss)
rand :: Int -> (Float, Int)
rand seed = (rand, seed') where
im = 139968
ia = 3877
ic = 29573
seed' = (seed * ia + ic) `rem` im
rand = fromIntegral seed' / fromIntegral im
data T b l
= B b (T b l) (T b l)
| L l
lookupNuc :: [(Char, Float)] -> ExpQ
lookupNuc = lam1E pP . go . snd . ann . build . acc 0 where
acc a ((b, f):xs) = L ((c2w b), f') : acc f' xs where f' = a + f
acc a [] = []
build [x] = x
build l@(x:_) = build (f l) where
f (l:h:xs) = B () l h: f xs
f xs = xs
ann (B () (L (lw, lf)) (L (hw, hf))) = (hf, B lf (L lw) (L hw))
ann (B () l (L (hw, hf))) = (hf, B lf l' (L hw))
where (lf, l') = ann l
ann (B () l h ) = (hf, B lf l' h')
where (lf, l') = ann l
(hf, h') = ann h
pP = varP . mkName $ "p"
pE = varE . mkName $ "p"
ltE = varE . mkName $ "<"
flE = litE . rationalL . toRational
w8E = litE . integerL . fromIntegral
go (L w) = w8E w
go (B f l h) = condE (appE (appE ltE pE) (flE f)) (go l) (go h)
--{-# OPTIONS_GHC -O2 -fexcess-precision #-}
--{-# OPTIONS_GHC -fvia-C -O2 -optc-O2 -optc-ffast-math -fexcess-precision #-}
{-# OPTIONS_GHC -fvia-C -O2 -optc-O2 -optc-ffast-math -optc-march=native -fexcess-precision -funbox-strict-fields #-}
{-# LANGUAGE BangPatterns, TemplateHaskell, MagicHash #-}
--
-- The Computer Language Benchmarks Game
-- http://shootout.alioth.debian.org/
--
-- The Noble League of Haskell Performance Knights
-- http://www.haskell.org/haskellwiki/Shootout
--
-- Contributed by Don Stewart
-- A lazy bytestring solution.
-- Unnecessary strictness annotations removed by Sterling Clover 2/08
-- Deburring and optimized lookup by Achim Schneider 11/09
--
-- Add:
-- -funfolding-use-threshold=40
--
import System
import Data.Word
import Control.Arrow
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Char8 as C (pack,unfoldr)
import qualified Data.ByteString as S
import Data.ByteString.Internal (c2w)
import Data.ByteString.Unsafe
import Fasta
import Data
import GHC.Word
import GHC.Base
import System.IO
import Foreign
main = getArgs >>= readIO . head >>= fasta
fasta n = writeFasta "ONE" "Homo sapiens alu" (n*2) alu
>> randFasta "TWO" "IUB ambiguity codes" (n*3) $(printSeq iubs) 42
>>= randFasta "THREE" "Homo sapiens frequency" (n*5) $(printSeq homs)
>> return ()
-------------------------------------------------------------------------
--
-- Lazily unfold the randomised dna sequences
--
randFasta l t n f g = putStrLn (">" ++ l ++ " " ++ t) >> f n g
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment