Skip to content

Instantly share code, notes, and snippets.

@jfear
Created February 9, 2017 16:30
Show Gist options
  • Save jfear/f2614604386275fd0a66fe0a197d22dd to your computer and use it in GitHub Desktop.
Save jfear/f2614604386275fd0a66fe0a197d22dd to your computer and use it in GitHub Desktop.
# vim: set ft=snakemake:
shell.prefix("""set -euo pipefail;
if [ -e /lscratch/$SLURM_JOBID ]; then
export TMPDIR=/lscratch/$SLURM_JOBID
else
export TMPDIR=/tmp
fi;
""")
workdir: '.'
################################################################################
# Config
################################################################################
sra_file = '/data/Oliverlab/project/remap/data/sra_info_dmel_13050.txt'
BAM_dir = '/data/Oliverlab/project/remap/result/pre_alignment'
hisat2_version = '2.0.4'
stringtie_version = '1.2.3'
samtools_version = '1.3'
rseqc_version = '2.6.3'
splice_file = '/data/Oliverlab/data/FlyBase/FB2015_04/dmel_ERCC.splicesites.txt'
hisat2_index = '/data/Oliverlab/data/FlyBase/FB2015_04/dmel_ERCC'
bed12_file = '/data/Oliverlab/data/FlyBase/FB2015_04/dmel_ERCC.bed12'
################################################################################
# Sample Information
################################################################################
import pandas as pd
sras = pd.read_csv(sra_file, sep='\t', header=None)
#ALL_SAMPLES = sras.iloc[:,0]
ALL_SAMPLES = sras.iloc[:1000,0]
################################################################################
# Targets
################################################################################
ALL_BAM = expand("{path}/{sample}/{sample}.bam", path = BAM_dir, sample = ALL_SAMPLES)
ALL_EXP = expand("{path}/{sample}/{sample}.exp", path = BAM_dir, sample = ALL_SAMPLES)
ALL_BAMSTAT = expand("{path}/{sample}/{sample}.bam_stat", path = BAM_dir, sample = ALL_SAMPLES)
ALL_TIN = expand("{path}/{sample}/{sample}.summary.txt", path = BAM_dir, sample = ALL_SAMPLES)
ALL_COVERAGE = expand("{path}/{sample}/{sample}.geneBodyCoverage.r", path = BAM_dir, sample = ALL_SAMPLES)
#ALL_HTSEQCOUNT = expand(config["BAM_dir"] + "/{sample}/{sample}.htseq_count", sample = ALL_SAMPLES)
rule all:
input: ALL_BAM + ALL_EXP + ALL_BAMSTAT + ALL_TIN + ALL_COVERAGE
# input: ALL_BAM + ALL_EXP + ALL_BAMSTAT + ALL_TIN + ALL_COVERAGE + ALL_LOG + ALL_GTF + ALL_HTSEQCOUNT
localrules: all
################################################################################
# Rules
################################################################################
rule hisat2:
output: bam = '{prefix}/{sample}.bam',
bai = '{prefix}/{sample}.bam.bai'
log: "{prefix}/{sample}.log"
threads: 6
params:
hisat_version = hisat2_version,
samtools_version = samtools_version,
hisat_index = hisat2_index,
sra = '{sample}',
splice_file = splice_file
shell:
"""
module load hisat/{params.hisat_version}
module load samtools/{params.samtools_version}
# Stage Index
if [ ! -e $TMPDIR/references ]; then
mkdir $TMPDIR/references
cp {params.hisat_index}*.ht2 $TMPDIR/references/
fi
REF=$TMPDIR/references/$(basename {params.hisat_index})
# Run Hisat2
hisat2 -p {threads} \\
-x $REF \\
--dta \\
--sra-acc {params.sra} \\
--known-splicesite-infile {params.splice_file} 2> {log} | samtools view -b | \\
samtools sort - \\
-T $TMPDIR/{wildcards.sample}.sort \\
-o $TMPDIR/{wildcards.sample}.bam
samtools index $TMPDIR/{wildcards.sample}.bam
cp $TMPDIR/{wildcards.sample}.bam {output.bam}
cp $TMPDIR/{wildcards.sample}.bam.bai {output.bai}
"""
rule rseqc:
input: '{prefix}/{sample}.bam'
output: infer_exp = '{prefix}/{sample}.exp',
bam_stat = '{prefix}/{sample}.bam_stat',
tin_txt = '{prefix}/{sample}.summary.txt',
tin_tsv = '{prefix}/{sample}.tin.tsv',
coverage = '{prefix}/{sample}.geneBodyCoverage.r'
params:
rseqc_version = rseqc_version,
bed12_file = bed12_file,
prefix = '{prefix}/{sample}'
shell:
"""
module load rseqc/{params.rseqc_version}
module load R
# Move to TMPDIR
cd $TMPDIR
BED12=$TMPDIR/$(basename {params.bed12_file})
if [ ! -e $BED12 ]; then
cp {params.bed12_file} $BED12
fi
BAM=$TMPDIR/$(basename {input[0]})
if [ ! -e $BAM ]; then
cp {input[0]} $BAM
cp {input[0]}.bai $BAM.bai
fi
# Infer Experiment
infer_experiment.py -i $BAM -r $BED12 > {output.infer_exp}
# BAM stat
bam_stat.py -i $BAM > {output.bam_stat}
# TIN
tin.py -i $BAM -r $BED12
cp {wildcards.sample}.summary.txt {output.tin_txt}
cp {wildcards.sample}.tin.xls {output.tin_tsv}
# Coverage
geneBody_coverage.py -i $BAM -r $BED12 -o {params.prefix}
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment