Skip to content

Instantly share code, notes, and snippets.

@blairdrummond
Created August 14, 2018 17:11
Show Gist options
  • Save blairdrummond/a5f95bd67b9fb44ec507301fe3db76d5 to your computer and use it in GitHub Desktop.
Save blairdrummond/a5f95bd67b9fb44ec507301fe3db76d5 to your computer and use it in GitHub Desktop.
getGenomes function using way too much memory
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE FlexibleInstances #-}
module Main where
-- parsing arguments
import Control.Exception
import Control.Monad
import Data.Semigroup ((<>))
import Options.Applicative
import GHC.IO.Exception
import Prelude hiding (catch, readFile, lines)
import System.IO.Error hiding (catch)
import System.IO
import System.Random
import Data.Foldable
import qualified Data.Text as T
import qualified Data.Text.IO as TIO
-- for the program
-- import Data.Array
import Data.List
import Data.Ord
-- import Data.String
import Data.Char
import Data.Maybe
-- import GHC.Int
-- import qualified Data.Text.Lazy as T
-- import qualified Data.Text.Lazy.IO as TI
-- import qualified Data.Map.Strict as M
{- The data types and functions -}
data Nucleobase = A | C | G | T
deriving (Read, Eq, Show, Ord, Bounded, Enum)
instance Random Nucleobase where
randomR (a, b) g =
case randomR (fromEnum a, fromEnum b) g of
(x, g') -> (toEnum x, g')
random g = randomR (minBound, maxBound) g
fromChar 'A' = A
fromChar 'C' = C
fromChar 'G' = G
fromChar 'T' = T
fromChar x = error $ "Character '" ++ (x:"") ++ "' not valid. A | C | G | T only!"
toChar A = 'A'
toChar C = 'C'
toChar G = 'G'
toChar T = 'T'
op :: Nucleobase -> Nucleobase
op A = T
op T = A
op G = C
op C = G
type Genome = [Nucleobase]
instance {-# OVERLAPPING #-} Show [Nucleobase] where
show = toStr
instance {-# OVERLAPPING #-} Read Genome where
readsPrec _ l = [(map fromChar l,"")]
conjugate :: Genome -> Genome
conjugate g = map op $ reverse g
fromText :: T.Text -> Genome
-- fromText s = foldl' (flip ((:) . fromChar)) [] $ takeWhile (not . isSpace) s
fromText = (T.foldl' (flip ((:) . fromChar)) []) . T.strip
toStr :: Genome -> String
toStr = foldl' (flip ((:) . toChar)) []
{- Count the number of occurance of pat in gene, with overlaps. -}
patternCount [] _ = error "Undefined for empty pattern"
patternCount pat gene = foldl' (\n g -> if pat `isPrefixOf` g then n+1 else n) 0 (tails gene)
skew :: Genome -> [Int]
skew l = scanl' gc 0 l
where
gc acc C = acc - 1
gc acc G = acc + 1
gc acc _ = acc
hamming s t = foldl' (\(!acc) b -> if b then (acc + 1) else acc) 0 $ zipWith (/=) s t
hammingClose d s t = hamming s t <= d
factors k s = take (length s - k + 1) $ map (take k) $ tails s
neighbourhood :: Int -> Genome -> [Genome]
neighbourhood 0 w = [w]
neighbourhood d (x:xs) = nub $ [ x : l | l <- neighbourhood d xs ]
++ [ a : l | l <- neighbourhood (d-1) xs, a <- [A,C,G,T], a /= x ]
neighbourhood n [] = [[]]
motifs :: Int -> Int -> [Genome] -> [Genome]
motifs k d ls = let ns = [ nub $ concatMap (neighbourhood d) (nub $ factors k l) | l <- ls ]
in foldl1' intersect ns
median :: Int -> [Genome] -> (Int, [Genome])
median k ls = let batch = head $ filter (not . null) [ motifs k d ls | d <- [0..] ]
pairs = [ (dist b, b) | b <- batch ]
low = fst $ minimumBy score pairs
in (low, map snd $ filter (\(a,_) -> a == low) pairs)
where
score = comparing fst
dist a = sum [ minimum $ map (hamming a) $ factors k l | l <- ls ]
profileMax :: Genome -> Int -> [[Double]] -> Genome
profileMax g k m = fst $ foldl1' (\(a,pa) (b,pb) -> if pa >= pb then (a,pa) else (b,pb)) [ (f, profile f) | f <- factors k g ]
where
profile f = foldl1' (*) $ zipWith (\col a -> col !! (fromEnum a)) (transpose m) f
motifToProfile :: [Genome] -> [[Double]]
motifToProfile gs = let ls = map (foldl' count [0,0,0,0]) $ transpose gs
in transpose [ [ (fromIntegral r) / (fromIntegral $ sum l) | r <- l ] | l <- ls ]
where
count !(a:c:g:t:[]) A = ((a+1):c:g:t:[])
count !(a:c:g:t:[]) C = (a:(c+1):g:t:[])
count !(a:c:g:t:[]) G = (a:c:(g+1):t:[])
count !(a:c:g:t:[]) T = (a:c:g:(t+1):[])
motifToProfileL :: [Genome] -> [[Double]]
motifToProfileL gs = let ls = map (foldl' count [1,1,1,1]) $ transpose gs
in transpose [ [ (fromIntegral r) / (fromIntegral $ sum l) | r <- l ] | l <- ls ]
where
count !(a:c:g:t:[]) A = ((a+1):c:g:t:[])
count !(a:c:g:t:[]) C = (a:(c+1):g:t:[])
count !(a:c:g:t:[]) G = (a:c:(g+1):t:[])
count !(a:c:g:t:[]) T = (a:c:g:(t+1):[])
greedyMotif :: Int -> [Genome] -> [Genome]
greedyMotif k [] = []
greedyMotif k (g:gs) = let ms = [ foldl' (\a b -> (profileMax b k (motifToProfile a)):a) [f] gs | f <- factors k g ]
in reverse $ minimumBy (comparing (profileScore . motifToProfileL)) ms
entropy prof = foldl1' (\acc p -> if p <= 0 then acc else acc - p * (logBase 2.0 p)) (concat prof)
profileScore prof = sum $ [ 1 - maximum c | c <- transpose prof ]
{- Random sequence -}
randomSeq l seed = let g = mkStdGen seed
in take l $ (randoms g :: [Nucleobase])
randomMotifSearch k gs g = let (g', ms) = mapAccumL randomSelect g $ map (factors k) gs
msseq = map (\(a,b) -> (a, entropy b)) $ motifSequence k gs ms
pairs = zip msseq (tail msseq)
in case find (\((_,ha),(_,hb)) -> ha <= hb) pairs of
Just (x,_) -> (x, g')
Nothing -> (last msseq, g')
where
randomSelect g !l = let (i, g_) = randomR (0, length l - 1) g
in (g_, l !! i)
motifSequence k gs ms = iterate' f (ms, motifToProfile ms)
where
f (!ms, !p) = let ms' = map (\x -> profileMax x k p) gs
in (ms', motifToProfile ms')
{- Parsing arguments and running main -}
data Args = Factors { k :: Int, file :: Maybe FilePath }
| Neighbours { d :: Int, file :: Maybe FilePath }
| HammingDist { s :: Genome, t :: Genome }
| Skew { file :: Maybe FilePath, min :: Bool }
| PatternCount { p :: Genome, file :: Maybe FilePath }
| ReverseConjugate { file :: Maybe FilePath }
| RandomSequence { l :: Int, seed :: Maybe Int }
| Motifs { k :: Int, d :: Int, file :: Maybe FilePath }
| Median { k :: Int, file :: Maybe FilePath }
| Profile { file :: Maybe FilePath }
| GreedyMotif { k :: Int, file :: Maybe FilePath }
| RandomMotif { k :: Int, seed :: Maybe Int, trials :: Maybe Int, file :: Maybe FilePath }
| Entropy { file :: Maybe FilePath }
factorsParse = Factors
<$> argument auto (help "Extract all l-mers" <> metavar "LENGTH")
<*> (optional $ argument str (metavar "FILENAME"
<> help "Input file (or stdin if not specified)" ))
neighboursParse = Neighbours
<$> argument auto (help "Form a d-neighbourhood about a word" <> metavar "DISTANCE")
<*> (optional $ argument str
(metavar "FILENAME"
<> help "Input file (or stdin if not specified)" ))
hammingParse = HammingDist
<$> argument auto
(help "Left Genome" <> metavar "LEFT")
<*> argument auto
(help "Right Genome" <> metavar "RIGHT")
skewParse = Skew
<$> (optional $ strOption
( long "file"
<> short 'f'
<> metavar "FILENAME"
<> help "Input file (or stdin if not specified)" ))
<*> switch
( long "min"
<> short 'm'
<> help "Find location(s) of minimum" )
patternParse = PatternCount
<$> argument auto
(help "Pattern to count occurances of" <> metavar "PATTERN")
<*> (optional $ argument str
(metavar "FILENAME"
<> help "Input file (or stdin if not specified)" ))
reverseParse = ReverseConjugate
<$> (optional $ argument str
(metavar "FILENAME"
<> help "Input file (or stdin if not specified)" ))
randomParse = RandomSequence
<$> argument auto (help "Random sequence of length LENGTH" <> metavar "LENGTH")
<*> (optional $ argument auto
(metavar "SEED"
<> help "Seed value (or 0 if not specified)"))
motifsParse = Motifs
<$> argument auto (help "k-mer motifs" <> metavar "LENGTH")
<*> argument auto (help "at most d mutations" <> metavar "DISTANCE")
<*> (optional $ argument str
(metavar "FILENAME"
<> help "Input file (or stdin if not specified)" ))
medianParse = Median
<$> argument auto (help "Consider k-mer median" <> metavar "LENGTH")
<*> (optional $ argument str
(metavar "FILENAME"
<> help "Input file (or stdin if not specified)" ))
profileParse = Profile
<$> (optional $ argument str
(metavar "FILENAME"
<> help "Input file (or stdin if not specified)" ))
greedyMotifParse = GreedyMotif
<$> argument auto (help "Consider k-mer medians" <> metavar "LENGTH")
<*> (optional $ argument str
(metavar "FILENAME"
<> help "Input file (or stdin if not specified)" ))
randomMotifParse = RandomMotif
<$> argument auto (help "Consider k-mer medians" <> metavar "LENGTH")
<*> (optional $ option auto
(metavar "TRIALS"
<> long "trials"
<> short 't'
<> help "Number of trials (or 1 if not specified)"))
<*> (optional $ option auto
(metavar "SEED"
<> long "seed"
<> short 's'
<> help "Seed value (or 1 if not specified)"))
<*> (optional $ argument str
(metavar "FILENAME"
<> help "Input file (or stdin if not specified)" ))
entropyParse = Entropy
<$> (optional $ argument str
(metavar "FILENAME"
<> help "Input file (or stdin if not specified)" ))
data ProfileSystem = ProfileSystem { profile_gene :: Genome, profile_k :: Int, profile_matrix :: [[Double]] }
instance Read ProfileSystem where
readsPrec _ s = let (g':i':ls) = lines s
in [(ProfileSystem (map fromChar g') (read i') [ map read (words l) | l <- ls ],"")]
argparse :: Parser Args
argparse = subparser (
command "factors" (info factorsParse (progDesc "Extract all k-mers"))
<> command "neighbourhood" (info neighboursParse (progDesc "Return d-neighbourhood of input words"))
<> command "hamming" (info hammingParse (progDesc "Compute hamming distance between two strings"))
<> command "skew" (info skewParse (progDesc "Compute Skew of Genome"))
<> command "count" (info patternParse (progDesc "Count number of occurances of PATTERN"))
<> command "reverse" (info reverseParse (progDesc "Take the reverse conjugate of input"))
<> command "random" (info randomParse (progDesc "Generate a random sequence of fixed length"))
<> command "motifs" (info motifsParse (progDesc "Find (k,d) motifs of input strings"))
<> command "median" (info medianParse (progDesc "Find k-mer closest to all input strings"))
<> command "profile" (info profileParse (progDesc "Stupid profile algorithm."))
<> command "greedymotif" (info greedyMotifParse (progDesc "Stupid motifs algorithm."))
<> command "randommotif" (info randomMotifParse (progDesc "Randomly construct motifs"))
<> command "entropy" (info entropyParse (progDesc "Compute entropy (log2)"))
)
getGenomes :: Maybe FilePath -> IO [Genome]
getGenomes path = do
c <- case path of
Nothing -> try $ do
term <- hIsTerminalDevice stdin
if not term
then liftM (map fromText) $ liftM T.lines $ TIO.hGetContents stdin
else do
throwIO (IOError (Just stdin) NoSuchThing "" "Error: No pipe from stdin\n" Nothing Nothing)
return []
Just path -> try $ do
s <- TIO.readFile path
return $ map fromText (T.lines s)
case c of
Left (IOError (Just stdin) _ _ m _ _) -> do
hPutStr stderr m
return []
Left ex -> do
let err = show (ex :: IOException)
hPutStr stderr ("Error:\n " ++ (show ex) ++ "\n")
throwIO ex
return []
Right content -> return content
main :: IO ()
main = do
args <- customExecParser (prefs showHelpOnEmpty) opts
case args of
Factors k path -> do
gs <- getGenomes path
mapM_ (\l -> mapM_ (putStrLn . show) $ factors k l) gs
Neighbours d path -> do
gs <- getGenomes path
mapM_ (\l -> mapM_ (putStrLn . show) $ neighbourhood d l) gs
HammingDist s t -> print $ hamming s t
Skew path min -> do
gs <- getGenomes path
mapM_ (\l -> if l == [] then return () else
if min
then do
let s = skew l
let m = minimum s
putStrLn ("Minimum skew is " ++ (show m))
putStr "At: "
putStr $ intercalate ", " $ map show (elemIndices m s)
putStrLn ""
else print $ skew l) gs
PatternCount p path -> do
gs <- getGenomes path
mapM_ (\l -> print $ patternCount p l) gs
ReverseConjugate path -> do
gs <- getGenomes path
mapM_ (\l -> if l == []
then return ()
else print $ conjugate l) gs
RandomSequence l Nothing -> print $ randomSeq l 0
RandomSequence l (Just s) -> print $ randomSeq l s
Motifs k d path -> do
gs <- getGenomes path
if gs == []
then hPutStr stderr ("No Input Strings!")
else mapM_ (putStrLn . show) (motifs k d gs)
Median k path -> do
gs <- getGenomes path
if gs == []
then hPutStr stderr ("No Input Strings!")
else let (score, motifs) = median k gs
in putStrLn ("Motifs: " ++ show motifs ++ ", score: " ++ show score)
Profile path -> do
c <- case path of
Nothing -> try $ do
term <- hIsTerminalDevice stdin
if not term
then TIO.hGetContents stdin
else do
throwIO (IOError (Just stdin) NoSuchThing "" "Error: No pipe from stdin\n" Nothing Nothing)
return T.empty
Just path -> try $ TIO.readFile path
str <- case c of
Left (IOError (Just stdin) _ _ m _ _) -> do
hPutStr stderr m
return T.empty
Left ex -> do
let err = show (ex :: IOException)
hPutStr stderr ("Error:\n " ++ (show ex) ++ "\n")
throwIO ex
Right content -> return content
let ProfileSystem g i m = read (T.unpack str)
print $ profileMax g i m
GreedyMotif k path -> do
gs <- getGenomes path
if gs == []
then hPutStr stderr ("No Input Strings!")
else let motifs = greedyMotif k gs
in mapM_ (putStrLn . show) motifs
RandomMotif k trials seed path -> do
gs <- getGenomes path
let s = case seed of
Nothing -> 1
Just n -> n
let g = mkStdGen s
let t = case trials of
Nothing -> 1
Just n -> n
if gs == []
then hPutStr stderr ("No Input Strings!")
else let ms = take t $ unfoldr (\g_ -> Just (randomMotifSearch k gs g_)) g
(m,h) = foldl1' (\a b -> case comparing snd a b of
LT -> a
GT -> b
EQ -> a) ms
in do mapM_ (putStrLn . show) m
putStrLn ""
putStrLn ("Entropy: " ++ show h)
Entropy path -> do
gs <- getGenomes path
print (entropy $ motifToProfile gs)
where
opts = info (argparse <**> helper)
( fullDesc
<> progDesc "A collection of functions for DNA stuff"
<> header "dna: A collection of bioinformatics algorithms" )
example :: [Genome]
example = [[C,G,C,C,C,C,T,C,T,C,G,G,G,G,G,T,G,T,T,C,A,G,T,A,A,A,C,G,G,C,C,A],
[G,G,G,C,G,A,G,G,T,A,T,G,T,G,T,A,A,G,T,G,C,C,A,A,G,G,T,G,C,C,A,G],
[T,A,G,T,A,C,C,G,A,G,A,C,C,G,A,A,A,G,A,A,G,T,A,T,A,C,A,G,G,C,G,T],
[T,A,G,A,T,C,A,A,G,T,T,T,C,A,G,G,T,G,C,A,C,G,T,C,G,G,T,G,A,A,C,C],
[A,A,T,C,C,A,C,C,A,G,C,T,C,C,A,C,G,T,G,C,A,A,T,G,T,T,G,G,C,C,T,A] ]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment