Skip to content

Instantly share code, notes, and snippets.

@fatho
Created January 27, 2015 18:29
Show Gist options
  • Save fatho/cadedb556df1cfcfdda6 to your computer and use it in GitHub Desktop.
Save fatho/cadedb556df1cfcfdda6 to your computer and use it in GitHub Desktop.
Haskell solution to the reverse-complement (http://benchmarksgame.alioth.debian.org/u32/performance.php?test=revcomp) language game benchmark using conduit.
{-# LANGUAGE LambdaCase, OverloadedStrings, RankNTypes, MultiWayIf, BangPatterns #-}
import Control.Monad
import Control.Monad.IO.Class
import qualified Data.Char as Char
import Data.Word
import Data.Monoid
import qualified Foreign.Storable as S
import Foreign.C.String
import qualified Data.Vector.Unboxed as VU
import qualified Data.Vector.Unboxed.Mutable as VUM
import qualified Data.ByteString as BS
import qualified Data.ByteString.Unsafe as BSU
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString.Builder as BB
import System.IO
import Data.Conduit
import qualified Data.Conduit.Binary as CB
fastaTitleMarker :: Word8
fastaTitleMarker = fromIntegral $ Char.ord '>'
newLine :: Word8
newLine = fromIntegral $ Char.ord '\n'
-- | Provides a fast lookup of the complement of a nucleic acid.
reverseComplement :: Word8 -> Word8
reverseComplement idx = VU.unsafeIndex lookupTable (fromIntegral idx) where
src = "\nAaCcGgTtUuMmRrWwSsYyKkVvHhDdBbNn"
dst = "\nTTGGCCAAAAKKYYWWSSRRMMBBDDHHVVNN"
-- one time creation of lookup table
lookupTable = VU.create $ do
vec <- VUM.new 256
forM_ (BS.zip src dst) $ \(s,d) -> do
VUM.write vec (fromIntegral s) d
return vec
-- | Accumulates data until the marker is found, then yields all accumulated bytes downstream.
-- ATTENTION: This needs at least as much memory as the largest chunk.
chunkedBy :: Monad m => Word8 -> Conduit BS.ByteString m BS.ByteString
chunkedBy marker = go mempty where
go builder = await >>= \case
Just chunk
| not $ BS.null chunk -> do
case BS.elemIndex marker (BS.tail chunk) of
Nothing -> go (builder <> BB.byteString chunk)
Just idx -> do
let (this,left) = BS.splitAt (idx + 1) chunk
let result = builder <> BB.byteString this
yield $ BL.toStrict $ BB.toLazyByteString result
go $ BB.byteString left
_ -> yield $ BL.toStrict $ BB.toLazyByteString builder
-- | Assumes that the received values are Sequences in the FASTA format.
-- Computes the reverse-complement UNSAFELY in-place.
unsafeReverseComp :: MonadIO m => Conduit BS.ByteString m BS.ByteString
unsafeReverseComp = awaitForever $ \chunk -> do
case BS.elemIndex newLine chunk of
Nothing -> yield chunk
Just dataStart -> do
liftIO $ BSU.unsafeUseAsCStringLen chunk (cstringRevComp dataStart)
yield chunk
-- | Takes a CStringLen containing DNA data and computes it's reverse complement.
cstringRevComp :: Int -> CStringLen -> IO ()
cstringRevComp start (cstr, len) = go start (len - 1) where
go !l !r
| l < r = do
lc <- S.peekByteOff cstr l
rc <- S.peekByteOff cstr r
if | lc == newLine -> go (l+1) r
| rc == newLine -> go l (r-1)
| otherwise -> do
S.pokeByteOff cstr l (reverseComplement rc)
S.pokeByteOff cstr r (reverseComplement lc)
go (l+1) (r-1)
| otherwise = return ()
main :: IO ()
main = CB.sourceHandle stdin
$= chunkedBy fastaTitleMarker
$= unsafeReverseComp
$$ CB.sinkHandle stdout
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment