averagehat/Lib.hs

## Lib.hs
module Lib where
import Data.Char(ord,chr,toUpper)
import Data.Word (Word8)
import System.Directory(doesFileExist)
import Control.Applicative
import Bio.Sequence.FastQ (readSangerQ, writeSangerQ)
import Bio.Core.Sequence (BioSeqQual, unSD, seqheader, seqdata, seqqual, unQD, unQual)
import qualified Data.ByteString.Lazy.Char8 as C
import qualified Data.ByteString.Lazy as BB
import Development.Shake
import Development.Shake.Command
import Development.Shake.FilePath
import Development.Shake.Util
import Data.Compressed.LZ78 (encode)


lowComplex :: BioSeqQual s => Double -> s -> Bool
lowComplex n s = compScore < n  where
  compScore    = (ucSize - cSize) / ucSize
  cSize        = length' $ encode uncompressed
  ucSize       = length' uncompressed
  uncompressed = unSD . seqdata s
  length'      = foldr 0 (\(_,z) -> z + 1)

filterPair :: BioSeqQual s => (s -> Bool) -> FilePath -> FilePath -> FilePath -> FilePath -> IO ()
filterPair f in1 in2 o1 o2 = do
  r1 <- readSangerQ in1
  r2 <- readSangerQ in2
  let (r1', r2') = unzip $ filter pred $ zip r1 r2
  writeSangerQ o1 r1'
  writeSangerQ o2 r2'
  where
    pred (fwd, rev) = (f fwd) && (f rev)


["R1.lzw", "R2.lzw"] &%> \[out1, out2] -> do
  let [src1, src2] = [r1 -<.> "deduped", r2 -<.> "deduped"]
  need [src1, src2]
  liftIO $ filterPair (not . (lowComplex 0.45)) src1 src2 out1 out2

"rapsearch.gi" %> \out -> do
  let src = need' "rapsearch.m8"
  rows  <- lines <$> readfile' src
  let rows' = dropwhile \(xs -> (head xs) == '@') rows
  let gids  = map (\x -> head $ splitOn "|" $ head splitOn "\t" x) rows'
  writeFile' out $ T.intercalate "\n" gids

"rapsearch.tsv" %> \out -> do
  let src = need' $ out -<.> "gi"
  acc2tax src out NTDB

data SeqType = NT | NR
acc2tax :: SeqType -> FilePath -> FilePath -> FilePath -> Action ()
acc2tax st fpin fpout = cmd "acc2tax" "--gi" "-i" fpin "-o" fpout "--database" db "--entries" ENTS type'
  where
    ENTS = show 1114054124
    (db, type') = case st of -- also switch to get NR/NT db
      NT -> ("NT", "--nucleotide")
      NR -> ("NR", "--protein")


need' x = do
  need [x]
  return x

## path-pipe-hs.cabal
name:                 path-pipe-hs
version:              0.1.0.0
synopsis:             Pathogen discovery pipeline
-- description:
-- license:
-- license-file:
homepage:             github.com/VDBWRAIR
author:               Michael Panciera
maintainer:           michael.panciera.work@gmail.com
category:             Development
-- copyright:
build-type:           Simple
-- extra-source-files:
cabal-version:        >=1.10

executable path-pipe-hs
  main-is:              Lib.hs
  -- other-modules:
  -- other-extensions:
  build-depends:        base >= 4.7 && < 5
                      , shake
                      , temporary
                      , directory
                      , biofastq
                      , biocore
                      , bytestring
                      , compressed
  hs-source-dirs:       src
  default-language:     Haskell2010
	module Lib where
	import Data.Char(ord,chr,toUpper)
	import Data.Word (Word8)
	import System.Directory(doesFileExist)
	import Control.Applicative
	import Bio.Sequence.FastQ (readSangerQ, writeSangerQ)
	import Bio.Core.Sequence (BioSeqQual, unSD, seqheader, seqdata, seqqual, unQD, unQual)
	import qualified Data.ByteString.Lazy.Char8 as C
	import qualified Data.ByteString.Lazy as BB
	import Development.Shake
	import Development.Shake.Command
	import Development.Shake.FilePath
	import Development.Shake.Util
	import Data.Compressed.LZ78 (encode)


	lowComplex :: BioSeqQual s => Double -> s -> Bool
	lowComplex n s = compScore < n where
	compScore = (ucSize - cSize) / ucSize
	cSize = length' $ encode uncompressed
	ucSize = length' uncompressed
	uncompressed = unSD . seqdata s
	length' = foldr 0 (\(_,z) -> z + 1)

	filterPair :: BioSeqQual s => (s -> Bool) -> FilePath -> FilePath -> FilePath -> FilePath -> IO ()
	filterPair f in1 in2 o1 o2 = do
	r1 <- readSangerQ in1
	r2 <- readSangerQ in2
	let (r1', r2') = unzip $ filter pred $ zip r1 r2
	writeSangerQ o1 r1'
	writeSangerQ o2 r2'
	where
	pred (fwd, rev) = (f fwd) && (f rev)


	["R1.lzw", "R2.lzw"] &%> \[out1, out2] -> do
	let [src1, src2] = [r1 -<.> "deduped", r2 -<.> "deduped"]
	need [src1, src2]
	liftIO $ filterPair (not . (lowComplex 0.45)) src1 src2 out1 out2

	"rapsearch.gi" %> \out -> do
	let src = need' "rapsearch.m8"
	rows <- lines <$> readfile' src
	let rows' = dropwhile \(xs -> (head xs) == '@') rows
	let gids = map (\x -> head $ splitOn "\|" $ head splitOn "\t" x) rows'
	writeFile' out $ T.intercalate "\n" gids

	"rapsearch.tsv" %> \out -> do
	let src = need' $ out -<.> "gi"
	acc2tax src out NTDB

	data SeqType = NT \| NR
	acc2tax :: SeqType -> FilePath -> FilePath -> FilePath -> Action ()
	acc2tax st fpin fpout = cmd "acc2tax" "--gi" "-i" fpin "-o" fpout "--database" db "--entries" ENTS type'
	where
	ENTS = show 1114054124
	(db, type') = case st of -- also switch to get NR/NT db
	NT -> ("NT", "--nucleotide")
	NR -> ("NR", "--protein")


	need' x = do
	need [x]
	return x
	name: path-pipe-hs
	version: 0.1.0.0
	synopsis: Pathogen discovery pipeline
	-- description:
	-- license:
	-- license-file:
	homepage: github.com/VDBWRAIR
	author: Michael Panciera
	maintainer: michael.panciera.work@gmail.com
	category: Development
	-- copyright:
	build-type: Simple
	-- extra-source-files:
	cabal-version: >=1.10

	executable path-pipe-hs
	main-is: Lib.hs
	-- other-modules:
	-- other-extensions:
	build-depends: base >= 4.7 && < 5
	, shake
	, temporary
	, directory
	, biofastq
	, biocore
	, bytestring
	, compressed
	hs-source-dirs: src
	default-language: Haskell2010