Created
January 27, 2015 18:29
-
-
Save fatho/cadedb556df1cfcfdda6 to your computer and use it in GitHub Desktop.
Haskell solution to the reverse-complement (http://benchmarksgame.alioth.debian.org/u32/performance.php?test=revcomp) language game benchmark using conduit.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE LambdaCase, OverloadedStrings, RankNTypes, MultiWayIf, BangPatterns #-} | |
import Control.Monad | |
import Control.Monad.IO.Class | |
import qualified Data.Char as Char | |
import Data.Word | |
import Data.Monoid | |
import qualified Foreign.Storable as S | |
import Foreign.C.String | |
import qualified Data.Vector.Unboxed as VU | |
import qualified Data.Vector.Unboxed.Mutable as VUM | |
import qualified Data.ByteString as BS | |
import qualified Data.ByteString.Unsafe as BSU | |
import qualified Data.ByteString.Lazy as BL | |
import qualified Data.ByteString.Builder as BB | |
import System.IO | |
import Data.Conduit | |
import qualified Data.Conduit.Binary as CB | |
fastaTitleMarker :: Word8 | |
fastaTitleMarker = fromIntegral $ Char.ord '>' | |
newLine :: Word8 | |
newLine = fromIntegral $ Char.ord '\n' | |
-- | Provides a fast lookup of the complement of a nucleic acid. | |
reverseComplement :: Word8 -> Word8 | |
reverseComplement idx = VU.unsafeIndex lookupTable (fromIntegral idx) where | |
src = "\nAaCcGgTtUuMmRrWwSsYyKkVvHhDdBbNn" | |
dst = "\nTTGGCCAAAAKKYYWWSSRRMMBBDDHHVVNN" | |
-- one time creation of lookup table | |
lookupTable = VU.create $ do | |
vec <- VUM.new 256 | |
forM_ (BS.zip src dst) $ \(s,d) -> do | |
VUM.write vec (fromIntegral s) d | |
return vec | |
-- | Accumulates data until the marker is found, then yields all accumulated bytes downstream. | |
-- ATTENTION: This needs at least as much memory as the largest chunk. | |
chunkedBy :: Monad m => Word8 -> Conduit BS.ByteString m BS.ByteString | |
chunkedBy marker = go mempty where | |
go builder = await >>= \case | |
Just chunk | |
| not $ BS.null chunk -> do | |
case BS.elemIndex marker (BS.tail chunk) of | |
Nothing -> go (builder <> BB.byteString chunk) | |
Just idx -> do | |
let (this,left) = BS.splitAt (idx + 1) chunk | |
let result = builder <> BB.byteString this | |
yield $ BL.toStrict $ BB.toLazyByteString result | |
go $ BB.byteString left | |
_ -> yield $ BL.toStrict $ BB.toLazyByteString builder | |
-- | Assumes that the received values are Sequences in the FASTA format. | |
-- Computes the reverse-complement UNSAFELY in-place. | |
unsafeReverseComp :: MonadIO m => Conduit BS.ByteString m BS.ByteString | |
unsafeReverseComp = awaitForever $ \chunk -> do | |
case BS.elemIndex newLine chunk of | |
Nothing -> yield chunk | |
Just dataStart -> do | |
liftIO $ BSU.unsafeUseAsCStringLen chunk (cstringRevComp dataStart) | |
yield chunk | |
-- | Takes a CStringLen containing DNA data and computes it's reverse complement. | |
cstringRevComp :: Int -> CStringLen -> IO () | |
cstringRevComp start (cstr, len) = go start (len - 1) where | |
go !l !r | |
| l < r = do | |
lc <- S.peekByteOff cstr l | |
rc <- S.peekByteOff cstr r | |
if | lc == newLine -> go (l+1) r | |
| rc == newLine -> go l (r-1) | |
| otherwise -> do | |
S.pokeByteOff cstr l (reverseComplement rc) | |
S.pokeByteOff cstr r (reverseComplement lc) | |
go (l+1) (r-1) | |
| otherwise = return () | |
main :: IO () | |
main = CB.sourceHandle stdin | |
$= chunkedBy fastaTitleMarker | |
$= unsafeReverseComp | |
$$ CB.sinkHandle stdout |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment