Created
December 17, 2016 17:22
-
-
Save averagehat/f709f94339626c919edbf2b8726a0d4d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Lib where | |
import Data.Char(ord,chr,toUpper) | |
import Data.Word (Word8) | |
import System.Directory(doesFileExist) | |
import Control.Applicative | |
import Bio.Sequence.FastQ (readSangerQ, writeSangerQ) | |
import Bio.Core.Sequence (BioSeqQual, unSD, seqheader, seqdata, seqqual, unQD, unQual) | |
import qualified Data.ByteString.Lazy.Char8 as C | |
import qualified Data.ByteString.Lazy as BB | |
import Development.Shake | |
import Development.Shake.Command | |
import Development.Shake.FilePath | |
import Development.Shake.Util | |
import Data.Compressed.LZ78 (encode) | |
lowComplex :: BioSeqQual s => Double -> s -> Bool | |
lowComplex n s = compScore < n where | |
compScore = (ucSize - cSize) / ucSize | |
cSize = length' $ encode uncompressed | |
ucSize = length' uncompressed | |
uncompressed = unSD . seqdata s | |
length' = foldr 0 (\(_,z) -> z + 1) | |
filterPair :: BioSeqQual s => (s -> Bool) -> FilePath -> FilePath -> FilePath -> FilePath -> IO () | |
filterPair f in1 in2 o1 o2 = do | |
r1 <- readSangerQ in1 | |
r2 <- readSangerQ in2 | |
let (r1', r2') = unzip $ filter pred $ zip r1 r2 | |
writeSangerQ o1 r1' | |
writeSangerQ o2 r2' | |
where | |
pred (fwd, rev) = (f fwd) && (f rev) | |
["R1.lzw", "R2.lzw"] &%> \[out1, out2] -> do | |
let [src1, src2] = [r1 -<.> "deduped", r2 -<.> "deduped"] | |
need [src1, src2] | |
liftIO $ filterPair (not . (lowComplex 0.45)) src1 src2 out1 out2 | |
"rapsearch.gi" %> \out -> do | |
let src = need' "rapsearch.m8" | |
rows <- lines <$> readfile' src | |
let rows' = dropwhile \(xs -> (head xs) == '@') rows | |
let gids = map (\x -> head $ splitOn "|" $ head splitOn "\t" x) rows' | |
writeFile' out $ T.intercalate "\n" gids | |
"rapsearch.tsv" %> \out -> do | |
let src = need' $ out -<.> "gi" | |
acc2tax src out NTDB | |
data SeqType = NT | NR | |
acc2tax :: SeqType -> FilePath -> FilePath -> FilePath -> Action () | |
acc2tax st fpin fpout = cmd "acc2tax" "--gi" "-i" fpin "-o" fpout "--database" db "--entries" ENTS type' | |
where | |
ENTS = show 1114054124 | |
(db, type') = case st of -- also switch to get NR/NT db | |
NT -> ("NT", "--nucleotide") | |
NR -> ("NR", "--protein") | |
need' x = do | |
need [x] | |
return x |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: path-pipe-hs | |
version: 0.1.0.0 | |
synopsis: Pathogen discovery pipeline | |
-- description: | |
-- license: | |
-- license-file: | |
homepage: github.com/VDBWRAIR | |
author: Michael Panciera | |
maintainer: michael.panciera.work@gmail.com | |
category: Development | |
-- copyright: | |
build-type: Simple | |
-- extra-source-files: | |
cabal-version: >=1.10 | |
executable path-pipe-hs | |
main-is: Lib.hs | |
-- other-modules: | |
-- other-extensions: | |
build-depends: base >= 4.7 && < 5 | |
, shake | |
, temporary | |
, directory | |
, biofastq | |
, biocore | |
, bytestring | |
, compressed | |
hs-source-dirs: src | |
default-language: Haskell2010 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment