blairdrummond/Main.hs

## Main.hs
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE FlexibleInstances #-}

module Main where

-- parsing arguments
import Control.Exception
import Control.Monad
import Data.Semigroup ((<>))
import Options.Applicative
import GHC.IO.Exception

import Prelude hiding (catch, readFile, lines)
import System.IO.Error hiding (catch)
import System.IO
import System.Random
import Data.Foldable
import qualified Data.Text     as T
import qualified Data.Text.IO  as TIO


-- for the program
-- import Data.Array
import Data.List
import Data.Ord
-- import Data.String
import Data.Char
import Data.Maybe
-- import GHC.Int

-- import qualified Data.Text.Lazy    as T
-- import qualified Data.Text.Lazy.IO as TI
-- import qualified Data.Map.Strict   as M


{- The data types and functions -}

data Nucleobase = A | C | G | T
                  deriving (Read, Eq, Show, Ord, Bounded, Enum)

instance Random Nucleobase where
  randomR (a, b) g =
    case randomR (fromEnum a, fromEnum b) g of
      (x, g') -> (toEnum x, g')
  random g = randomR (minBound, maxBound) g

fromChar 'A' = A
fromChar 'C' = C
fromChar 'G' = G
fromChar 'T' = T
fromChar  x  = error $ "Character '" ++ (x:"") ++ "' not valid. A | C | G | T  only!"

toChar A = 'A'
toChar C = 'C'
toChar G = 'G'
toChar T = 'T'

op :: Nucleobase -> Nucleobase
op A = T
op T = A
op G = C
op C = G

type Genome = [Nucleobase]

instance {-# OVERLAPPING #-} Show [Nucleobase] where
  show = toStr

instance {-# OVERLAPPING #-} Read Genome where
  readsPrec _ l = [(map fromChar l,"")]

conjugate :: Genome -> Genome
conjugate g = map op $ reverse g

fromText :: T.Text -> Genome
-- fromText s = foldl' (flip  ((:) . fromChar)) [] $ takeWhile (not . isSpace) s
fromText = (T.foldl' (flip ((:) . fromChar)) []) . T.strip

toStr :: Genome -> String
toStr = foldl' (flip ((:) . toChar)) []


{- Count the number of occurance of pat in gene, with overlaps. -}
patternCount []  _    = error "Undefined for empty pattern"
patternCount pat gene = foldl' (\n g -> if pat `isPrefixOf` g then n+1 else n) 0 (tails gene)

skew :: Genome -> [Int]
skew l = scanl' gc 0 l
         where
           gc acc C = acc - 1
           gc acc G = acc + 1
           gc acc _ = acc


hamming s t = foldl' (\(!acc) b -> if b then (acc + 1) else acc) 0 $ zipWith (/=) s t

hammingClose d s t = hamming s t <= d

factors k s =  take (length s - k + 1) $ map (take k) $ tails s


neighbourhood :: Int -> Genome -> [Genome]
neighbourhood 0   w    = [w]
neighbourhood d (x:xs) = nub $ [ x : l | l <- neighbourhood d xs ]
                         ++ [ a : l | l <- neighbourhood (d-1) xs, a <- [A,C,G,T], a /= x ]
neighbourhood n  []    = [[]]


motifs :: Int -> Int -> [Genome] -> [Genome]
motifs k d ls = let ns = [ nub $ concatMap (neighbourhood d) (nub $ factors k l) | l <- ls ]
                in  foldl1' intersect ns


median :: Int -> [Genome] -> (Int, [Genome])
median k ls = let batch = head $ filter (not . null) [ motifs k d ls | d <- [0..] ]
                  pairs = [ (dist b, b) | b <- batch ]
                  low   = fst $ minimumBy score pairs
              in (low, map snd $ filter (\(a,_) -> a == low) pairs)
  where
    score = comparing fst
    dist a = sum [ minimum $ map (hamming a) $ factors k l | l <- ls ]


profileMax :: Genome -> Int -> [[Double]] -> Genome
profileMax g k m = fst $ foldl1' (\(a,pa) (b,pb) -> if pa >= pb then (a,pa) else (b,pb)) [ (f, profile f) | f <- factors k g ]
  where
    profile f = foldl1' (*) $ zipWith (\col a -> col !! (fromEnum a)) (transpose m) f


motifToProfile :: [Genome] -> [[Double]]
motifToProfile gs = let ls = map (foldl' count [0,0,0,0]) $ transpose gs
                    in  transpose [ [ (fromIntegral r) / (fromIntegral $ sum l) | r <- l ] | l <- ls ]
  where
    count !(a:c:g:t:[]) A = ((a+1):c:g:t:[])
    count !(a:c:g:t:[]) C = (a:(c+1):g:t:[])
    count !(a:c:g:t:[]) G = (a:c:(g+1):t:[])
    count !(a:c:g:t:[]) T = (a:c:g:(t+1):[])


motifToProfileL :: [Genome] -> [[Double]]
motifToProfileL gs = let ls = map (foldl' count [1,1,1,1]) $ transpose gs
                     in  transpose [ [ (fromIntegral r) / (fromIntegral $ sum l) | r <- l ] | l <- ls ]
  where
    count !(a:c:g:t:[]) A = ((a+1):c:g:t:[])
    count !(a:c:g:t:[]) C = (a:(c+1):g:t:[])
    count !(a:c:g:t:[]) G = (a:c:(g+1):t:[])
    count !(a:c:g:t:[]) T = (a:c:g:(t+1):[])


greedyMotif :: Int -> [Genome] -> [Genome]
greedyMotif k [] = []
greedyMotif k (g:gs) = let ms = [ foldl' (\a b -> (profileMax b k (motifToProfile a)):a) [f] gs | f <- factors k g ]
                       in reverse $ minimumBy (comparing (profileScore . motifToProfileL)) ms

entropy      prof = foldl1'  (\acc p -> if p <= 0 then acc else acc - p * (logBase 2.0 p)) (concat prof)
profileScore prof = sum $ [ 1 - maximum c | c <- transpose prof ]


{- Random sequence -}
randomSeq l seed = let g = mkStdGen seed
                   in take l $ (randoms g :: [Nucleobase])

randomMotifSearch k gs g = let (g', ms) = mapAccumL randomSelect g $ map (factors k) gs
                               msseq = map (\(a,b) -> (a, entropy b)) $ motifSequence k gs ms
                               pairs = zip msseq (tail msseq)
                           in case find (\((_,ha),(_,hb)) -> ha <= hb) pairs of
                                Just (x,_) -> (x, g')
                                Nothing    -> (last msseq, g')
  where
    randomSelect g !l = let (i, g_) = randomR (0, length l - 1) g
                        in (g_, l !! i)

motifSequence k gs ms = iterate' f (ms, motifToProfile ms)
  where
    f (!ms, !p) = let ms' = map (\x -> profileMax x k p) gs
                  in (ms', motifToProfile ms')


{- Parsing arguments and running main -}
data Args = Factors           { k :: Int, file :: Maybe FilePath }
          | Neighbours        { d :: Int, file :: Maybe FilePath }
          | HammingDist       { s :: Genome, t :: Genome }
          | Skew              { file :: Maybe FilePath, min :: Bool }
          | PatternCount      { p :: Genome, file :: Maybe FilePath }
          | ReverseConjugate  { file :: Maybe FilePath }
          | RandomSequence    { l :: Int, seed :: Maybe Int }
          | Motifs            { k :: Int, d :: Int, file :: Maybe FilePath }
          | Median            { k :: Int, file :: Maybe FilePath }
          | Profile           { file :: Maybe FilePath }
          | GreedyMotif       { k :: Int, file :: Maybe FilePath }
          | RandomMotif       { k :: Int, seed :: Maybe Int, trials :: Maybe Int, file :: Maybe FilePath }
          | Entropy           { file :: Maybe FilePath }


factorsParse = Factors
               <$> argument auto (help "Extract all l-mers" <> metavar "LENGTH")
               <*> (optional $ argument str (metavar "FILENAME"
                 <> help "Input file (or stdin if not specified)" ))

neighboursParse = Neighbours
               <$> argument auto (help "Form a d-neighbourhood about a word" <> metavar "DISTANCE")
               <*> (optional $ argument str
               (metavar "FILENAME"
                 <> help "Input file (or stdin if not specified)" ))


hammingParse = HammingDist
               <$> argument auto
               (help "Left Genome" <> metavar "LEFT")
               <*> argument auto
               (help "Right Genome" <> metavar "RIGHT")


skewParse = Skew
            <$> (optional $ strOption
            ( long "file"
              <> short 'f'
              <> metavar "FILENAME"
              <> help "Input file (or stdin if not specified)" ))
            <*> switch
            ( long "min"
              <> short 'm'
              <> help "Find location(s) of minimum" )


patternParse = PatternCount
               <$> argument auto
               (help "Pattern to count occurances of" <> metavar "PATTERN")
               <*> (optional $ argument str
                    (metavar "FILENAME"
                     <> help "Input file (or stdin if not specified)" ))


reverseParse = ReverseConjugate
            <$> (optional $ argument str
                 (metavar "FILENAME"
                  <> help "Input file (or stdin if not specified)" ))


randomParse = RandomSequence
              <$> argument auto (help "Random sequence of length LENGTH" <> metavar "LENGTH")
              <*> (optional $ argument auto
                   (metavar "SEED"
                    <> help "Seed value (or 0 if not specified)"))

motifsParse = Motifs
              <$> argument auto (help "k-mer motifs" <> metavar "LENGTH")
              <*> argument auto (help "at most d mutations" <> metavar "DISTANCE")
              <*> (optional $ argument str
                   (metavar "FILENAME"
                    <> help "Input file (or stdin if not specified)" ))


medianParse = Median
              <$> argument auto (help "Consider k-mer median" <> metavar "LENGTH")
              <*> (optional $ argument str
                   (metavar "FILENAME"
                    <> help "Input file (or stdin if not specified)" ))


profileParse = Profile
               <$> (optional $ argument str
                    (metavar "FILENAME"
                     <> help "Input file (or stdin if not specified)" ))

greedyMotifParse = GreedyMotif
                   <$> argument auto (help "Consider k-mer medians" <> metavar "LENGTH")
                   <*> (optional $ argument str
                        (metavar "FILENAME"
                         <> help "Input file (or stdin if not specified)" ))

randomMotifParse = RandomMotif
                   <$> argument auto (help "Consider k-mer medians" <> metavar "LENGTH")
                   <*> (optional $ option auto
                        (metavar "TRIALS"
                         <> long "trials"
                         <> short 't'
                         <> help "Number of trials (or 1 if not specified)"))
                   <*> (optional $ option auto
                        (metavar "SEED"
                         <> long "seed"
                         <> short 's'
                         <> help "Seed value (or 1 if not specified)"))
                   <*> (optional $ argument str
                        (metavar "FILENAME"
                         <> help "Input file (or stdin if not specified)" ))

entropyParse = Entropy
               <$> (optional $ argument str
                    (metavar "FILENAME"
                     <> help "Input file (or stdin if not specified)" ))


data ProfileSystem = ProfileSystem { profile_gene :: Genome, profile_k :: Int, profile_matrix :: [[Double]] }

instance Read ProfileSystem where
  readsPrec _ s = let (g':i':ls) = lines s
                  in [(ProfileSystem (map fromChar g') (read i') [ map read (words l) | l <- ls ],"")]


argparse :: Parser Args
argparse = subparser (
     command "factors"       (info factorsParse     (progDesc "Extract all k-mers"))
  <> command "neighbourhood" (info neighboursParse  (progDesc "Return d-neighbourhood of input words"))
  <> command "hamming"       (info hammingParse     (progDesc "Compute hamming distance between two strings"))
  <> command "skew"          (info skewParse        (progDesc "Compute Skew of Genome"))
  <> command "count"         (info patternParse     (progDesc "Count number of occurances of PATTERN"))
  <> command "reverse"       (info reverseParse     (progDesc "Take the reverse conjugate of input"))
  <> command "random"        (info randomParse      (progDesc "Generate a random sequence of fixed length"))
  <> command "motifs"        (info motifsParse      (progDesc "Find (k,d) motifs of input strings"))
  <> command "median"        (info medianParse      (progDesc "Find k-mer closest to all input strings"))
  <> command "profile"       (info profileParse     (progDesc "Stupid profile algorithm."))
  <> command "greedymotif"   (info greedyMotifParse (progDesc "Stupid motifs algorithm."))
  <> command "randommotif"   (info randomMotifParse (progDesc "Randomly construct motifs"))
  <> command "entropy"       (info entropyParse     (progDesc "Compute entropy (log2)"))
     )


getGenomes :: Maybe FilePath -> IO [Genome]
getGenomes path = do
  c <- case path of
         Nothing -> try $ do
           term <- hIsTerminalDevice stdin
           if not term
             then liftM (map fromText) $ liftM T.lines $ TIO.hGetContents stdin
             else do
             throwIO (IOError (Just stdin) NoSuchThing "" "Error: No pipe from stdin\n" Nothing Nothing)
             return []

         Just path -> try $ do
           s <- TIO.readFile path
           return $ map fromText (T.lines s)

  case c of
    Left (IOError (Just stdin) _ _ m _ _) -> do
      hPutStr stderr m
      return []

    Left ex -> do
      let err = show (ex :: IOException)
      hPutStr stderr ("Error:\n    " ++ (show ex) ++ "\n")
      throwIO ex
      return []

    Right content -> return content


main :: IO ()
main = do
  args <- customExecParser (prefs showHelpOnEmpty) opts
  case args of
    Factors k path -> do
      gs <- getGenomes path
      mapM_ (\l -> mapM_ (putStrLn . show) $ factors k l) gs

    Neighbours d path -> do
      gs <- getGenomes path
      mapM_ (\l -> mapM_ (putStrLn . show) $ neighbourhood d l) gs

    HammingDist s t -> print $ hamming s t

    Skew path min -> do
      gs <- getGenomes path
      mapM_ (\l -> if l == [] then return () else
                     if min
                     then do
                       let s = skew l
                       let m = minimum s
                       putStrLn ("Minimum skew is " ++ (show m))
                       putStr   "At: "
                       putStr $ intercalate ", " $ map show (elemIndices m s)
                       putStrLn ""
                     else print $ skew l) gs

    PatternCount p path -> do
      gs <- getGenomes path
      mapM_ (\l -> print $ patternCount p l) gs

    ReverseConjugate path -> do
      gs <- getGenomes path
      mapM_ (\l -> if l == []
                   then return ()
                   else print $ conjugate l) gs

    RandomSequence l Nothing  -> print $ randomSeq l 0
    RandomSequence l (Just s) -> print $ randomSeq l s

    Motifs k d path -> do
      gs <- getGenomes path
      if gs == []
        then hPutStr stderr ("No Input Strings!")
        else mapM_ (putStrLn . show) (motifs k d gs)

    Median k path -> do
      gs <- getGenomes path
      if gs == []
        then hPutStr stderr ("No Input Strings!")
        else let (score, motifs) = median k gs
             in  putStrLn ("Motifs: " ++ show motifs ++ ", score: " ++ show score)

    Profile path -> do
      c <- case path of
             Nothing -> try $ do
               term <- hIsTerminalDevice stdin
               if not term
                 then TIO.hGetContents stdin
                 else do
                 throwIO (IOError (Just stdin) NoSuchThing "" "Error: No pipe from stdin\n" Nothing Nothing)
                 return T.empty

             Just path -> try $ TIO.readFile path

      str <- case c of
               Left (IOError (Just stdin) _ _ m _ _) -> do
                 hPutStr stderr m
                 return T.empty

               Left ex -> do
                 let err = show (ex :: IOException)
                 hPutStr stderr ("Error:\n    " ++ (show ex) ++ "\n")
                 throwIO ex

               Right content -> return content

      let ProfileSystem g i m = read (T.unpack str)
      print $ profileMax g i m


    GreedyMotif k path -> do
      gs <- getGenomes path
      if gs == []
        then hPutStr stderr ("No Input Strings!")
        else let motifs = greedyMotif k gs
             in  mapM_ (putStrLn . show) motifs

    RandomMotif k trials seed path -> do
      gs <- getGenomes path
      let s = case seed of
            Nothing -> 1
            Just n -> n
      let g = mkStdGen s
      let t = case trials of
            Nothing -> 1
            Just n -> n

      if gs == []
        then hPutStr stderr ("No Input Strings!")
        else let ms = take t $ unfoldr (\g_ -> Just (randomMotifSearch k gs g_)) g
                 (m,h) = foldl1' (\a b -> case comparing snd a b of
                                            LT -> a
                                            GT -> b
                                            EQ -> a) ms
             in do mapM_ (putStrLn . show) m
                   putStrLn ""
                   putStrLn ("Entropy: " ++ show h)

    Entropy path -> do
      gs <- getGenomes path
      print (entropy $ motifToProfile gs)

    where
      opts = info (argparse <**> helper)
        ( fullDesc
          <> progDesc "A collection of functions for DNA stuff"
          <> header "dna: A collection of bioinformatics algorithms" )


example :: [Genome]
example = [[C,G,C,C,C,C,T,C,T,C,G,G,G,G,G,T,G,T,T,C,A,G,T,A,A,A,C,G,G,C,C,A],
           [G,G,G,C,G,A,G,G,T,A,T,G,T,G,T,A,A,G,T,G,C,C,A,A,G,G,T,G,C,C,A,G],
           [T,A,G,T,A,C,C,G,A,G,A,C,C,G,A,A,A,G,A,A,G,T,A,T,A,C,A,G,G,C,G,T],
           [T,A,G,A,T,C,A,A,G,T,T,T,C,A,G,G,T,G,C,A,C,G,T,C,G,G,T,G,A,A,C,C],
           [A,A,T,C,C,A,C,C,A,G,C,T,C,C,A,C,G,T,G,C,A,A,T,G,T,T,G,G,C,C,T,A] ]
	{-# LANGUAGE BangPatterns #-}
	{-# LANGUAGE FlexibleInstances #-}

	module Main where

	-- parsing arguments
	import Control.Exception
	import Control.Monad
	import Data.Semigroup ((<>))
	import Options.Applicative
	import GHC.IO.Exception

	import Prelude hiding (catch, readFile, lines)
	import System.IO.Error hiding (catch)
	import System.IO
	import System.Random
	import Data.Foldable
	import qualified Data.Text as T
	import qualified Data.Text.IO as TIO


	-- for the program
	-- import Data.Array
	import Data.List
	import Data.Ord
	-- import Data.String
	import Data.Char
	import Data.Maybe
	-- import GHC.Int

	-- import qualified Data.Text.Lazy as T
	-- import qualified Data.Text.Lazy.IO as TI
	-- import qualified Data.Map.Strict as M



	{- The data types and functions -}

	data Nucleobase = A \| C \| G \| T
	deriving (Read, Eq, Show, Ord, Bounded, Enum)

	instance Random Nucleobase where
	randomR (a, b) g =
	case randomR (fromEnum a, fromEnum b) g of
	(x, g') -> (toEnum x, g')
	random g = randomR (minBound, maxBound) g

	fromChar 'A' = A
	fromChar 'C' = C
	fromChar 'G' = G
	fromChar 'T' = T
	fromChar x = error $ "Character '" ++ (x:"") ++ "' not valid. A \| C \| G \| T only!"

	toChar A = 'A'
	toChar C = 'C'
	toChar G = 'G'
	toChar T = 'T'

	op :: Nucleobase -> Nucleobase
	op A = T
	op T = A
	op G = C
	op C = G

	type Genome = [Nucleobase]

	instance {-# OVERLAPPING #-} Show [Nucleobase] where
	show = toStr

	instance {-# OVERLAPPING #-} Read Genome where
	readsPrec _ l = [(map fromChar l,"")]

	conjugate :: Genome -> Genome
	conjugate g = map op $ reverse g

	fromText :: T.Text -> Genome
	-- fromText s = foldl' (flip ((:) . fromChar)) [] $ takeWhile (not . isSpace) s
	fromText = (T.foldl' (flip ((:) . fromChar)) []) . T.strip

	toStr :: Genome -> String
	toStr = foldl' (flip ((:) . toChar)) []


	{- Count the number of occurance of pat in gene, with overlaps. -}
	patternCount [] _ = error "Undefined for empty pattern"
	patternCount pat gene = foldl' (\n g -> if pat `isPrefixOf` g then n+1 else n) 0 (tails gene)

	skew :: Genome -> [Int]
	skew l = scanl' gc 0 l
	where
	gc acc C = acc - 1
	gc acc G = acc + 1
	gc acc _ = acc


	hamming s t = foldl' (\(!acc) b -> if b then (acc + 1) else acc) 0 $ zipWith (/=) s t

	hammingClose d s t = hamming s t <= d

	factors k s = take (length s - k + 1) $ map (take k) $ tails s


	neighbourhood :: Int -> Genome -> [Genome]
	neighbourhood 0 w = [w]
	neighbourhood d (x:xs) = nub $ [ x : l \| l <- neighbourhood d xs ]
	++ [ a : l \| l <- neighbourhood (d-1) xs, a <- [A,C,G,T], a /= x ]
	neighbourhood n [] = [[]]


	motifs :: Int -> Int -> [Genome] -> [Genome]
	motifs k d ls = let ns = [ nub $ concatMap (neighbourhood d) (nub $ factors k l) \| l <- ls ]
	in foldl1' intersect ns


	median :: Int -> [Genome] -> (Int, [Genome])
	median k ls = let batch = head $ filter (not . null) [ motifs k d ls \| d <- [0..] ]
	pairs = [ (dist b, b) \| b <- batch ]
	low = fst $ minimumBy score pairs
	in (low, map snd $ filter (\(a,_) -> a == low) pairs)
	where
	score = comparing fst
	dist a = sum [ minimum $ map (hamming a) $ factors k l \| l <- ls ]


	profileMax :: Genome -> Int -> [[Double]] -> Genome
	profileMax g k m = fst $ foldl1' (\(a,pa) (b,pb) -> if pa >= pb then (a,pa) else (b,pb)) [ (f, profile f) \| f <- factors k g ]
	where
	profile f = foldl1' (*) $ zipWith (\col a -> col !! (fromEnum a)) (transpose m) f


	motifToProfile :: [Genome] -> [[Double]]
	motifToProfile gs = let ls = map (foldl' count [0,0,0,0]) $ transpose gs
	in transpose [ [ (fromIntegral r) / (fromIntegral $ sum l) \| r <- l ] \| l <- ls ]
	where
	count !(a:c:g:t:[]) A = ((a+1):c:g:t:[])
	count !(a:c:g:t:[]) C = (a:(c+1):g:t:[])
	count !(a:c:g:t:[]) G = (a:c:(g+1):t:[])
	count !(a:c:g:t:[]) T = (a:c:g:(t+1):[])


	motifToProfileL :: [Genome] -> [[Double]]
	motifToProfileL gs = let ls = map (foldl' count [1,1,1,1]) $ transpose gs
	in transpose [ [ (fromIntegral r) / (fromIntegral $ sum l) \| r <- l ] \| l <- ls ]
	where
	count !(a:c:g:t:[]) A = ((a+1):c:g:t:[])
	count !(a:c:g:t:[]) C = (a:(c+1):g:t:[])
	count !(a:c:g:t:[]) G = (a:c:(g+1):t:[])
	count !(a:c:g:t:[]) T = (a:c:g:(t+1):[])


	greedyMotif :: Int -> [Genome] -> [Genome]
	greedyMotif k [] = []
	greedyMotif k (g:gs) = let ms = [ foldl' (\a b -> (profileMax b k (motifToProfile a)):a) [f] gs \| f <- factors k g ]
	in reverse $ minimumBy (comparing (profileScore . motifToProfileL)) ms

	entropy prof = foldl1' (\acc p -> if p <= 0 then acc else acc - p * (logBase 2.0 p)) (concat prof)
	profileScore prof = sum $ [ 1 - maximum c \| c <- transpose prof ]


	{- Random sequence -}
	randomSeq l seed = let g = mkStdGen seed
	in take l $ (randoms g :: [Nucleobase])

	randomMotifSearch k gs g = let (g', ms) = mapAccumL randomSelect g $ map (factors k) gs
	msseq = map (\(a,b) -> (a, entropy b)) $ motifSequence k gs ms
	pairs = zip msseq (tail msseq)
	in case find (\((_,ha),(_,hb)) -> ha <= hb) pairs of
	Just (x,_) -> (x, g')
	Nothing -> (last msseq, g')
	where
	randomSelect g !l = let (i, g_) = randomR (0, length l - 1) g
	in (g_, l !! i)

	motifSequence k gs ms = iterate' f (ms, motifToProfile ms)
	where
	f (!ms, !p) = let ms' = map (\x -> profileMax x k p) gs
	in (ms', motifToProfile ms')





	{- Parsing arguments and running main -}
	data Args = Factors { k :: Int, file :: Maybe FilePath }
	\| Neighbours { d :: Int, file :: Maybe FilePath }
	\| HammingDist { s :: Genome, t :: Genome }
	\| Skew { file :: Maybe FilePath, min :: Bool }
	\| PatternCount { p :: Genome, file :: Maybe FilePath }
	\| ReverseConjugate { file :: Maybe FilePath }
	\| RandomSequence { l :: Int, seed :: Maybe Int }
	\| Motifs { k :: Int, d :: Int, file :: Maybe FilePath }
	\| Median { k :: Int, file :: Maybe FilePath }
	\| Profile { file :: Maybe FilePath }
	\| GreedyMotif { k :: Int, file :: Maybe FilePath }
	\| RandomMotif { k :: Int, seed :: Maybe Int, trials :: Maybe Int, file :: Maybe FilePath }
	\| Entropy { file :: Maybe FilePath }


	factorsParse = Factors
	<$> argument auto (help "Extract all l-mers" <> metavar "LENGTH")
	<*> (optional $ argument str (metavar "FILENAME"
	<> help "Input file (or stdin if not specified)" ))

	neighboursParse = Neighbours
	<$> argument auto (help "Form a d-neighbourhood about a word" <> metavar "DISTANCE")
	<*> (optional $ argument str
	(metavar "FILENAME"
	<> help "Input file (or stdin if not specified)" ))


	hammingParse = HammingDist
	<$> argument auto
	(help "Left Genome" <> metavar "LEFT")
	<*> argument auto
	(help "Right Genome" <> metavar "RIGHT")


	skewParse = Skew
	<$> (optional $ strOption
	( long "file"
	<> short 'f'
	<> metavar "FILENAME"
	<> help "Input file (or stdin if not specified)" ))
	<*> switch
	( long "min"
	<> short 'm'
	<> help "Find location(s) of minimum" )


	patternParse = PatternCount
	<$> argument auto
	(help "Pattern to count occurances of" <> metavar "PATTERN")
	<*> (optional $ argument str
	(metavar "FILENAME"
	<> help "Input file (or stdin if not specified)" ))


	reverseParse = ReverseConjugate
	<$> (optional $ argument str
	(metavar "FILENAME"
	<> help "Input file (or stdin if not specified)" ))


	randomParse = RandomSequence
	<$> argument auto (help "Random sequence of length LENGTH" <> metavar "LENGTH")
	<*> (optional $ argument auto
	(metavar "SEED"
	<> help "Seed value (or 0 if not specified)"))

	motifsParse = Motifs
	<$> argument auto (help "k-mer motifs" <> metavar "LENGTH")
	<*> argument auto (help "at most d mutations" <> metavar "DISTANCE")
	<*> (optional $ argument str
	(metavar "FILENAME"
	<> help "Input file (or stdin if not specified)" ))


	medianParse = Median
	<$> argument auto (help "Consider k-mer median" <> metavar "LENGTH")
	<*> (optional $ argument str
	(metavar "FILENAME"
	<> help "Input file (or stdin if not specified)" ))


	profileParse = Profile
	<$> (optional $ argument str
	(metavar "FILENAME"
	<> help "Input file (or stdin if not specified)" ))

	greedyMotifParse = GreedyMotif
	<$> argument auto (help "Consider k-mer medians" <> metavar "LENGTH")
	<*> (optional $ argument str
	(metavar "FILENAME"
	<> help "Input file (or stdin if not specified)" ))

	randomMotifParse = RandomMotif
	<$> argument auto (help "Consider k-mer medians" <> metavar "LENGTH")
	<*> (optional $ option auto
	(metavar "TRIALS"
	<> long "trials"
	<> short 't'
	<> help "Number of trials (or 1 if not specified)"))
	<*> (optional $ option auto
	(metavar "SEED"
	<> long "seed"
	<> short 's'
	<> help "Seed value (or 1 if not specified)"))
	<*> (optional $ argument str
	(metavar "FILENAME"
	<> help "Input file (or stdin if not specified)" ))

	entropyParse = Entropy
	<$> (optional $ argument str
	(metavar "FILENAME"
	<> help "Input file (or stdin if not specified)" ))


	data ProfileSystem = ProfileSystem { profile_gene :: Genome, profile_k :: Int, profile_matrix :: [[Double]] }

	instance Read ProfileSystem where
	readsPrec _ s = let (g':i':ls) = lines s
	in [(ProfileSystem (map fromChar g') (read i') [ map read (words l) \| l <- ls ],"")]



	argparse :: Parser Args
	argparse = subparser (
	command "factors" (info factorsParse (progDesc "Extract all k-mers"))
	<> command "neighbourhood" (info neighboursParse (progDesc "Return d-neighbourhood of input words"))
	<> command "hamming" (info hammingParse (progDesc "Compute hamming distance between two strings"))
	<> command "skew" (info skewParse (progDesc "Compute Skew of Genome"))
	<> command "count" (info patternParse (progDesc "Count number of occurances of PATTERN"))
	<> command "reverse" (info reverseParse (progDesc "Take the reverse conjugate of input"))
	<> command "random" (info randomParse (progDesc "Generate a random sequence of fixed length"))
	<> command "motifs" (info motifsParse (progDesc "Find (k,d) motifs of input strings"))
	<> command "median" (info medianParse (progDesc "Find k-mer closest to all input strings"))
	<> command "profile" (info profileParse (progDesc "Stupid profile algorithm."))
	<> command "greedymotif" (info greedyMotifParse (progDesc "Stupid motifs algorithm."))
	<> command "randommotif" (info randomMotifParse (progDesc "Randomly construct motifs"))
	<> command "entropy" (info entropyParse (progDesc "Compute entropy (log2)"))
	)



	getGenomes :: Maybe FilePath -> IO [Genome]
	getGenomes path = do
	c <- case path of
	Nothing -> try $ do
	term <- hIsTerminalDevice stdin
	if not term
	then liftM (map fromText) $ liftM T.lines $ TIO.hGetContents stdin
	else do
	throwIO (IOError (Just stdin) NoSuchThing "" "Error: No pipe from stdin\n" Nothing Nothing)
	return []

	Just path -> try $ do
	s <- TIO.readFile path
	return $ map fromText (T.lines s)

	case c of
	Left (IOError (Just stdin) _ _ m _ _) -> do
	hPutStr stderr m
	return []

	Left ex -> do
	let err = show (ex :: IOException)
	hPutStr stderr ("Error:\n " ++ (show ex) ++ "\n")
	throwIO ex
	return []

	Right content -> return content



	main :: IO ()
	main = do
	args <- customExecParser (prefs showHelpOnEmpty) opts
	case args of
	Factors k path -> do
	gs <- getGenomes path
	mapM_ (\l -> mapM_ (putStrLn . show) $ factors k l) gs

	Neighbours d path -> do
	gs <- getGenomes path
	mapM_ (\l -> mapM_ (putStrLn . show) $ neighbourhood d l) gs

	HammingDist s t -> print $ hamming s t

	Skew path min -> do
	gs <- getGenomes path
	mapM_ (\l -> if l == [] then return () else
	if min
	then do
	let s = skew l
	let m = minimum s
	putStrLn ("Minimum skew is " ++ (show m))
	putStr "At: "
	putStr $ intercalate ", " $ map show (elemIndices m s)
	putStrLn ""
	else print $ skew l) gs

	PatternCount p path -> do
	gs <- getGenomes path
	mapM_ (\l -> print $ patternCount p l) gs

	ReverseConjugate path -> do
	gs <- getGenomes path
	mapM_ (\l -> if l == []
	then return ()
	else print $ conjugate l) gs

	RandomSequence l Nothing -> print $ randomSeq l 0
	RandomSequence l (Just s) -> print $ randomSeq l s

	Motifs k d path -> do
	gs <- getGenomes path
	if gs == []
	then hPutStr stderr ("No Input Strings!")
	else mapM_ (putStrLn . show) (motifs k d gs)

	Median k path -> do
	gs <- getGenomes path
	if gs == []
	then hPutStr stderr ("No Input Strings!")
	else let (score, motifs) = median k gs
	in putStrLn ("Motifs: " ++ show motifs ++ ", score: " ++ show score)

	Profile path -> do
	c <- case path of
	Nothing -> try $ do
	term <- hIsTerminalDevice stdin
	if not term
	then TIO.hGetContents stdin
	else do
	throwIO (IOError (Just stdin) NoSuchThing "" "Error: No pipe from stdin\n" Nothing Nothing)
	return T.empty

	Just path -> try $ TIO.readFile path

	str <- case c of
	Left (IOError (Just stdin) _ _ m _ _) -> do
	hPutStr stderr m
	return T.empty

	Left ex -> do
	let err = show (ex :: IOException)
	hPutStr stderr ("Error:\n " ++ (show ex) ++ "\n")
	throwIO ex

	Right content -> return content

	let ProfileSystem g i m = read (T.unpack str)
	print $ profileMax g i m



	GreedyMotif k path -> do
	gs <- getGenomes path
	if gs == []
	then hPutStr stderr ("No Input Strings!")
	else let motifs = greedyMotif k gs
	in mapM_ (putStrLn . show) motifs

	RandomMotif k trials seed path -> do
	gs <- getGenomes path
	let s = case seed of
	Nothing -> 1
	Just n -> n
	let g = mkStdGen s
	let t = case trials of
	Nothing -> 1
	Just n -> n

	if gs == []
	then hPutStr stderr ("No Input Strings!")
	else let ms = take t $ unfoldr (\g_ -> Just (randomMotifSearch k gs g_)) g
	(m,h) = foldl1' (\a b -> case comparing snd a b of
	LT -> a
	GT -> b
	EQ -> a) ms
	in do mapM_ (putStrLn . show) m
	putStrLn ""
	putStrLn ("Entropy: " ++ show h)

	Entropy path -> do
	gs <- getGenomes path
	print (entropy $ motifToProfile gs)

	where
	opts = info (argparse <**> helper)
	( fullDesc
	<> progDesc "A collection of functions for DNA stuff"
	<> header "dna: A collection of bioinformatics algorithms" )


	example :: [Genome]
	example = [[C,G,C,C,C,C,T,C,T,C,G,G,G,G,G,T,G,T,T,C,A,G,T,A,A,A,C,G,G,C,C,A],
	[G,G,G,C,G,A,G,G,T,A,T,G,T,G,T,A,A,G,T,G,C,C,A,A,G,G,T,G,C,C,A,G],
	[T,A,G,T,A,C,C,G,A,G,A,C,C,G,A,A,A,G,A,A,G,T,A,T,A,C,A,G,G,C,G,T],
	[T,A,G,A,T,C,A,A,G,T,T,T,C,A,G,G,T,G,C,A,C,G,T,C,G,G,T,G,A,A,C,C],
	[A,A,T,C,C,A,C,C,A,G,C,T,C,C,A,C,G,T,G,C,A,A,T,G,T,T,G,G,C,C,T,A] ]