Created
August 14, 2018 17:11
-
-
Save blairdrummond/a5f95bd67b9fb44ec507301fe3db76d5 to your computer and use it in GitHub Desktop.
getGenomes function using way too much memory
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE BangPatterns #-} | |
{-# LANGUAGE FlexibleInstances #-} | |
module Main where | |
-- parsing arguments | |
import Control.Exception | |
import Control.Monad | |
import Data.Semigroup ((<>)) | |
import Options.Applicative | |
import GHC.IO.Exception | |
import Prelude hiding (catch, readFile, lines) | |
import System.IO.Error hiding (catch) | |
import System.IO | |
import System.Random | |
import Data.Foldable | |
import qualified Data.Text as T | |
import qualified Data.Text.IO as TIO | |
-- for the program | |
-- import Data.Array | |
import Data.List | |
import Data.Ord | |
-- import Data.String | |
import Data.Char | |
import Data.Maybe | |
-- import GHC.Int | |
-- import qualified Data.Text.Lazy as T | |
-- import qualified Data.Text.Lazy.IO as TI | |
-- import qualified Data.Map.Strict as M | |
{- The data types and functions -} | |
data Nucleobase = A | C | G | T | |
deriving (Read, Eq, Show, Ord, Bounded, Enum) | |
instance Random Nucleobase where | |
randomR (a, b) g = | |
case randomR (fromEnum a, fromEnum b) g of | |
(x, g') -> (toEnum x, g') | |
random g = randomR (minBound, maxBound) g | |
fromChar 'A' = A | |
fromChar 'C' = C | |
fromChar 'G' = G | |
fromChar 'T' = T | |
fromChar x = error $ "Character '" ++ (x:"") ++ "' not valid. A | C | G | T only!" | |
toChar A = 'A' | |
toChar C = 'C' | |
toChar G = 'G' | |
toChar T = 'T' | |
op :: Nucleobase -> Nucleobase | |
op A = T | |
op T = A | |
op G = C | |
op C = G | |
type Genome = [Nucleobase] | |
instance {-# OVERLAPPING #-} Show [Nucleobase] where | |
show = toStr | |
instance {-# OVERLAPPING #-} Read Genome where | |
readsPrec _ l = [(map fromChar l,"")] | |
conjugate :: Genome -> Genome | |
conjugate g = map op $ reverse g | |
fromText :: T.Text -> Genome | |
-- fromText s = foldl' (flip ((:) . fromChar)) [] $ takeWhile (not . isSpace) s | |
fromText = (T.foldl' (flip ((:) . fromChar)) []) . T.strip | |
toStr :: Genome -> String | |
toStr = foldl' (flip ((:) . toChar)) [] | |
{- Count the number of occurance of pat in gene, with overlaps. -} | |
patternCount [] _ = error "Undefined for empty pattern" | |
patternCount pat gene = foldl' (\n g -> if pat `isPrefixOf` g then n+1 else n) 0 (tails gene) | |
skew :: Genome -> [Int] | |
skew l = scanl' gc 0 l | |
where | |
gc acc C = acc - 1 | |
gc acc G = acc + 1 | |
gc acc _ = acc | |
hamming s t = foldl' (\(!acc) b -> if b then (acc + 1) else acc) 0 $ zipWith (/=) s t | |
hammingClose d s t = hamming s t <= d | |
factors k s = take (length s - k + 1) $ map (take k) $ tails s | |
neighbourhood :: Int -> Genome -> [Genome] | |
neighbourhood 0 w = [w] | |
neighbourhood d (x:xs) = nub $ [ x : l | l <- neighbourhood d xs ] | |
++ [ a : l | l <- neighbourhood (d-1) xs, a <- [A,C,G,T], a /= x ] | |
neighbourhood n [] = [[]] | |
motifs :: Int -> Int -> [Genome] -> [Genome] | |
motifs k d ls = let ns = [ nub $ concatMap (neighbourhood d) (nub $ factors k l) | l <- ls ] | |
in foldl1' intersect ns | |
median :: Int -> [Genome] -> (Int, [Genome]) | |
median k ls = let batch = head $ filter (not . null) [ motifs k d ls | d <- [0..] ] | |
pairs = [ (dist b, b) | b <- batch ] | |
low = fst $ minimumBy score pairs | |
in (low, map snd $ filter (\(a,_) -> a == low) pairs) | |
where | |
score = comparing fst | |
dist a = sum [ minimum $ map (hamming a) $ factors k l | l <- ls ] | |
profileMax :: Genome -> Int -> [[Double]] -> Genome | |
profileMax g k m = fst $ foldl1' (\(a,pa) (b,pb) -> if pa >= pb then (a,pa) else (b,pb)) [ (f, profile f) | f <- factors k g ] | |
where | |
profile f = foldl1' (*) $ zipWith (\col a -> col !! (fromEnum a)) (transpose m) f | |
motifToProfile :: [Genome] -> [[Double]] | |
motifToProfile gs = let ls = map (foldl' count [0,0,0,0]) $ transpose gs | |
in transpose [ [ (fromIntegral r) / (fromIntegral $ sum l) | r <- l ] | l <- ls ] | |
where | |
count !(a:c:g:t:[]) A = ((a+1):c:g:t:[]) | |
count !(a:c:g:t:[]) C = (a:(c+1):g:t:[]) | |
count !(a:c:g:t:[]) G = (a:c:(g+1):t:[]) | |
count !(a:c:g:t:[]) T = (a:c:g:(t+1):[]) | |
motifToProfileL :: [Genome] -> [[Double]] | |
motifToProfileL gs = let ls = map (foldl' count [1,1,1,1]) $ transpose gs | |
in transpose [ [ (fromIntegral r) / (fromIntegral $ sum l) | r <- l ] | l <- ls ] | |
where | |
count !(a:c:g:t:[]) A = ((a+1):c:g:t:[]) | |
count !(a:c:g:t:[]) C = (a:(c+1):g:t:[]) | |
count !(a:c:g:t:[]) G = (a:c:(g+1):t:[]) | |
count !(a:c:g:t:[]) T = (a:c:g:(t+1):[]) | |
greedyMotif :: Int -> [Genome] -> [Genome] | |
greedyMotif k [] = [] | |
greedyMotif k (g:gs) = let ms = [ foldl' (\a b -> (profileMax b k (motifToProfile a)):a) [f] gs | f <- factors k g ] | |
in reverse $ minimumBy (comparing (profileScore . motifToProfileL)) ms | |
entropy prof = foldl1' (\acc p -> if p <= 0 then acc else acc - p * (logBase 2.0 p)) (concat prof) | |
profileScore prof = sum $ [ 1 - maximum c | c <- transpose prof ] | |
{- Random sequence -} | |
randomSeq l seed = let g = mkStdGen seed | |
in take l $ (randoms g :: [Nucleobase]) | |
randomMotifSearch k gs g = let (g', ms) = mapAccumL randomSelect g $ map (factors k) gs | |
msseq = map (\(a,b) -> (a, entropy b)) $ motifSequence k gs ms | |
pairs = zip msseq (tail msseq) | |
in case find (\((_,ha),(_,hb)) -> ha <= hb) pairs of | |
Just (x,_) -> (x, g') | |
Nothing -> (last msseq, g') | |
where | |
randomSelect g !l = let (i, g_) = randomR (0, length l - 1) g | |
in (g_, l !! i) | |
motifSequence k gs ms = iterate' f (ms, motifToProfile ms) | |
where | |
f (!ms, !p) = let ms' = map (\x -> profileMax x k p) gs | |
in (ms', motifToProfile ms') | |
{- Parsing arguments and running main -} | |
data Args = Factors { k :: Int, file :: Maybe FilePath } | |
| Neighbours { d :: Int, file :: Maybe FilePath } | |
| HammingDist { s :: Genome, t :: Genome } | |
| Skew { file :: Maybe FilePath, min :: Bool } | |
| PatternCount { p :: Genome, file :: Maybe FilePath } | |
| ReverseConjugate { file :: Maybe FilePath } | |
| RandomSequence { l :: Int, seed :: Maybe Int } | |
| Motifs { k :: Int, d :: Int, file :: Maybe FilePath } | |
| Median { k :: Int, file :: Maybe FilePath } | |
| Profile { file :: Maybe FilePath } | |
| GreedyMotif { k :: Int, file :: Maybe FilePath } | |
| RandomMotif { k :: Int, seed :: Maybe Int, trials :: Maybe Int, file :: Maybe FilePath } | |
| Entropy { file :: Maybe FilePath } | |
factorsParse = Factors | |
<$> argument auto (help "Extract all l-mers" <> metavar "LENGTH") | |
<*> (optional $ argument str (metavar "FILENAME" | |
<> help "Input file (or stdin if not specified)" )) | |
neighboursParse = Neighbours | |
<$> argument auto (help "Form a d-neighbourhood about a word" <> metavar "DISTANCE") | |
<*> (optional $ argument str | |
(metavar "FILENAME" | |
<> help "Input file (or stdin if not specified)" )) | |
hammingParse = HammingDist | |
<$> argument auto | |
(help "Left Genome" <> metavar "LEFT") | |
<*> argument auto | |
(help "Right Genome" <> metavar "RIGHT") | |
skewParse = Skew | |
<$> (optional $ strOption | |
( long "file" | |
<> short 'f' | |
<> metavar "FILENAME" | |
<> help "Input file (or stdin if not specified)" )) | |
<*> switch | |
( long "min" | |
<> short 'm' | |
<> help "Find location(s) of minimum" ) | |
patternParse = PatternCount | |
<$> argument auto | |
(help "Pattern to count occurances of" <> metavar "PATTERN") | |
<*> (optional $ argument str | |
(metavar "FILENAME" | |
<> help "Input file (or stdin if not specified)" )) | |
reverseParse = ReverseConjugate | |
<$> (optional $ argument str | |
(metavar "FILENAME" | |
<> help "Input file (or stdin if not specified)" )) | |
randomParse = RandomSequence | |
<$> argument auto (help "Random sequence of length LENGTH" <> metavar "LENGTH") | |
<*> (optional $ argument auto | |
(metavar "SEED" | |
<> help "Seed value (or 0 if not specified)")) | |
motifsParse = Motifs | |
<$> argument auto (help "k-mer motifs" <> metavar "LENGTH") | |
<*> argument auto (help "at most d mutations" <> metavar "DISTANCE") | |
<*> (optional $ argument str | |
(metavar "FILENAME" | |
<> help "Input file (or stdin if not specified)" )) | |
medianParse = Median | |
<$> argument auto (help "Consider k-mer median" <> metavar "LENGTH") | |
<*> (optional $ argument str | |
(metavar "FILENAME" | |
<> help "Input file (or stdin if not specified)" )) | |
profileParse = Profile | |
<$> (optional $ argument str | |
(metavar "FILENAME" | |
<> help "Input file (or stdin if not specified)" )) | |
greedyMotifParse = GreedyMotif | |
<$> argument auto (help "Consider k-mer medians" <> metavar "LENGTH") | |
<*> (optional $ argument str | |
(metavar "FILENAME" | |
<> help "Input file (or stdin if not specified)" )) | |
randomMotifParse = RandomMotif | |
<$> argument auto (help "Consider k-mer medians" <> metavar "LENGTH") | |
<*> (optional $ option auto | |
(metavar "TRIALS" | |
<> long "trials" | |
<> short 't' | |
<> help "Number of trials (or 1 if not specified)")) | |
<*> (optional $ option auto | |
(metavar "SEED" | |
<> long "seed" | |
<> short 's' | |
<> help "Seed value (or 1 if not specified)")) | |
<*> (optional $ argument str | |
(metavar "FILENAME" | |
<> help "Input file (or stdin if not specified)" )) | |
entropyParse = Entropy | |
<$> (optional $ argument str | |
(metavar "FILENAME" | |
<> help "Input file (or stdin if not specified)" )) | |
data ProfileSystem = ProfileSystem { profile_gene :: Genome, profile_k :: Int, profile_matrix :: [[Double]] } | |
instance Read ProfileSystem where | |
readsPrec _ s = let (g':i':ls) = lines s | |
in [(ProfileSystem (map fromChar g') (read i') [ map read (words l) | l <- ls ],"")] | |
argparse :: Parser Args | |
argparse = subparser ( | |
command "factors" (info factorsParse (progDesc "Extract all k-mers")) | |
<> command "neighbourhood" (info neighboursParse (progDesc "Return d-neighbourhood of input words")) | |
<> command "hamming" (info hammingParse (progDesc "Compute hamming distance between two strings")) | |
<> command "skew" (info skewParse (progDesc "Compute Skew of Genome")) | |
<> command "count" (info patternParse (progDesc "Count number of occurances of PATTERN")) | |
<> command "reverse" (info reverseParse (progDesc "Take the reverse conjugate of input")) | |
<> command "random" (info randomParse (progDesc "Generate a random sequence of fixed length")) | |
<> command "motifs" (info motifsParse (progDesc "Find (k,d) motifs of input strings")) | |
<> command "median" (info medianParse (progDesc "Find k-mer closest to all input strings")) | |
<> command "profile" (info profileParse (progDesc "Stupid profile algorithm.")) | |
<> command "greedymotif" (info greedyMotifParse (progDesc "Stupid motifs algorithm.")) | |
<> command "randommotif" (info randomMotifParse (progDesc "Randomly construct motifs")) | |
<> command "entropy" (info entropyParse (progDesc "Compute entropy (log2)")) | |
) | |
getGenomes :: Maybe FilePath -> IO [Genome] | |
getGenomes path = do | |
c <- case path of | |
Nothing -> try $ do | |
term <- hIsTerminalDevice stdin | |
if not term | |
then liftM (map fromText) $ liftM T.lines $ TIO.hGetContents stdin | |
else do | |
throwIO (IOError (Just stdin) NoSuchThing "" "Error: No pipe from stdin\n" Nothing Nothing) | |
return [] | |
Just path -> try $ do | |
s <- TIO.readFile path | |
return $ map fromText (T.lines s) | |
case c of | |
Left (IOError (Just stdin) _ _ m _ _) -> do | |
hPutStr stderr m | |
return [] | |
Left ex -> do | |
let err = show (ex :: IOException) | |
hPutStr stderr ("Error:\n " ++ (show ex) ++ "\n") | |
throwIO ex | |
return [] | |
Right content -> return content | |
main :: IO () | |
main = do | |
args <- customExecParser (prefs showHelpOnEmpty) opts | |
case args of | |
Factors k path -> do | |
gs <- getGenomes path | |
mapM_ (\l -> mapM_ (putStrLn . show) $ factors k l) gs | |
Neighbours d path -> do | |
gs <- getGenomes path | |
mapM_ (\l -> mapM_ (putStrLn . show) $ neighbourhood d l) gs | |
HammingDist s t -> print $ hamming s t | |
Skew path min -> do | |
gs <- getGenomes path | |
mapM_ (\l -> if l == [] then return () else | |
if min | |
then do | |
let s = skew l | |
let m = minimum s | |
putStrLn ("Minimum skew is " ++ (show m)) | |
putStr "At: " | |
putStr $ intercalate ", " $ map show (elemIndices m s) | |
putStrLn "" | |
else print $ skew l) gs | |
PatternCount p path -> do | |
gs <- getGenomes path | |
mapM_ (\l -> print $ patternCount p l) gs | |
ReverseConjugate path -> do | |
gs <- getGenomes path | |
mapM_ (\l -> if l == [] | |
then return () | |
else print $ conjugate l) gs | |
RandomSequence l Nothing -> print $ randomSeq l 0 | |
RandomSequence l (Just s) -> print $ randomSeq l s | |
Motifs k d path -> do | |
gs <- getGenomes path | |
if gs == [] | |
then hPutStr stderr ("No Input Strings!") | |
else mapM_ (putStrLn . show) (motifs k d gs) | |
Median k path -> do | |
gs <- getGenomes path | |
if gs == [] | |
then hPutStr stderr ("No Input Strings!") | |
else let (score, motifs) = median k gs | |
in putStrLn ("Motifs: " ++ show motifs ++ ", score: " ++ show score) | |
Profile path -> do | |
c <- case path of | |
Nothing -> try $ do | |
term <- hIsTerminalDevice stdin | |
if not term | |
then TIO.hGetContents stdin | |
else do | |
throwIO (IOError (Just stdin) NoSuchThing "" "Error: No pipe from stdin\n" Nothing Nothing) | |
return T.empty | |
Just path -> try $ TIO.readFile path | |
str <- case c of | |
Left (IOError (Just stdin) _ _ m _ _) -> do | |
hPutStr stderr m | |
return T.empty | |
Left ex -> do | |
let err = show (ex :: IOException) | |
hPutStr stderr ("Error:\n " ++ (show ex) ++ "\n") | |
throwIO ex | |
Right content -> return content | |
let ProfileSystem g i m = read (T.unpack str) | |
print $ profileMax g i m | |
GreedyMotif k path -> do | |
gs <- getGenomes path | |
if gs == [] | |
then hPutStr stderr ("No Input Strings!") | |
else let motifs = greedyMotif k gs | |
in mapM_ (putStrLn . show) motifs | |
RandomMotif k trials seed path -> do | |
gs <- getGenomes path | |
let s = case seed of | |
Nothing -> 1 | |
Just n -> n | |
let g = mkStdGen s | |
let t = case trials of | |
Nothing -> 1 | |
Just n -> n | |
if gs == [] | |
then hPutStr stderr ("No Input Strings!") | |
else let ms = take t $ unfoldr (\g_ -> Just (randomMotifSearch k gs g_)) g | |
(m,h) = foldl1' (\a b -> case comparing snd a b of | |
LT -> a | |
GT -> b | |
EQ -> a) ms | |
in do mapM_ (putStrLn . show) m | |
putStrLn "" | |
putStrLn ("Entropy: " ++ show h) | |
Entropy path -> do | |
gs <- getGenomes path | |
print (entropy $ motifToProfile gs) | |
where | |
opts = info (argparse <**> helper) | |
( fullDesc | |
<> progDesc "A collection of functions for DNA stuff" | |
<> header "dna: A collection of bioinformatics algorithms" ) | |
example :: [Genome] | |
example = [[C,G,C,C,C,C,T,C,T,C,G,G,G,G,G,T,G,T,T,C,A,G,T,A,A,A,C,G,G,C,C,A], | |
[G,G,G,C,G,A,G,G,T,A,T,G,T,G,T,A,A,G,T,G,C,C,A,A,G,G,T,G,C,C,A,G], | |
[T,A,G,T,A,C,C,G,A,G,A,C,C,G,A,A,A,G,A,A,G,T,A,T,A,C,A,G,G,C,G,T], | |
[T,A,G,A,T,C,A,A,G,T,T,T,C,A,G,G,T,G,C,A,C,G,T,C,G,G,T,G,A,A,C,C], | |
[A,A,T,C,C,A,C,C,A,G,C,T,C,C,A,C,G,T,G,C,A,A,T,G,T,T,G,G,C,C,T,A] ] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment