These notes assume you have a single lane of sequencing.
These notes assume you are using bash.
Make sure your ~/.softwarerc file contains (at a minimum):
team31
irods
sanger-samtools-refpath
Firstly, login to farm3-login and create a new directory in one of your Lustre directories and change into it. For example:
mkdir /lustre/scratch109/sanger/ip7/traf2-rnaseq
cd /lustre/scratch109/sanger/ip7/traf2-rnaseq
Secondly, set an environment variable representing the run/lane. For example:
run_lane=15367_4
Authenticate to iRODS:
kinit
Download CRAM files and their metadata:
mkdir $run_lane
run=`echo $run_lane | sed -e 's/_.*//'`
lane=`echo $run_lane | sed -e 's/.*_//'`
/software/irods/icommands/bin/imeta qu -z seq -d id_run = $run and lane = $lane and target = 1 and type = cram \
| grep : | awk '{ print $2 }' | paste - - -d/ \
| xargs -ixxx /software/irods/icommands/bin/iget -K xxx $run_lane
chmod 664 $run_lane/*
find $run_lane | grep -E '#(168|888)\.' | xargs rm
for cram in $(find $run_lane | grep cram$ | sed -e 's/.*\///' | sed -e 's/\.cram$//')
do
/software/irods/icommands/bin/imeta ls -d /seq/$run/$cram.cram > $run_lane/$cram.imeta
done
ls $run_lane/*.cram | sed -e 's/.cram$//' | xargs -ixxx \
bsub -o xxx.cram2bam.o -e xxx.cram2bam.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
"/software/team31/bin/samtools-1.2 view -b xxx.cram > xxx.bam"
Check jobs ran OK:
ls $run_lane/*.cram | wc -l && \
grep -l 'Successfully completed' $run_lane/*.cram2bam.o | wc -l && \
grep -l 'Exited' $run_lane/*.cram2bam.o | wc -l
rm $run_lane/*.cram
Get list of samples:
grep -A1 sample$ $run_lane/*.imeta | grep value | sed -e 's/.imeta-value: /.bam:/' \
> samples.txt
Extract FASTQ:
mkdir fastq
for line in `cat samples.txt`
do
bam=`echo $line | sed -e 's/:.*//'`
sample=`echo $line | sed -e 's/.*://'`
bsub -o fastq/$sample.o -e fastq/$sample.e \
-R'select[mem>500] rusage[mem=500]' -M500 \
/software/hpag/bin/bamtofastq \
F=fastq/$sample.1.fastq \
F2=fastq/$sample.2.fastq \
filename=$bam
done
Check jobs ran OK:
ls $run_lane/*.bam | wc -l && \
grep -l 'Successfully completed' fastq/*.o | wc -l && \
grep -l 'Exited' fastq/*.o | wc -l
Confirm correct number of reads extracted:
echo -e "Sample\tCRAM\tFASTQ" && \
for line in `grep -A1 -E 'total_reads$|sample$' $run_lane/*.imeta | grep imeta | paste - - | sort -V \
| awk '{ print $4 }' | paste - - | awk '{ print $1 ":" $2 }'`
do
sample=`echo $line | sed -e 's/:.*//'`
count1=`echo $line | sed -e 's/.*://'`
count2=$((`cat fastq/$sample.?.fastq | wc -l` / 4))
echo -e "$sample\t$count1\t$count2"
done
rm $run_lane/*.bam
bsub -n16 -o fastq/fastqc.o -e fastq/fastqc.e \
-R'span[hosts=1] select[mem>4000] rusage[mem=4000]' -M4000 \
perl /software/team31/packages/FastQC/fastqc --threads 16 fastq/*.fastq
Check job ran OK:
echo 1 && \
grep -l 'Successfully completed' fastq/fastqc.o | wc -l && \
grep -l 'Exited' fastq/fastqc.o | wc -l
Check HTML files to ensure reads don't need trimming (of adapters or for quality).
mkdir ref
cd ref
wget ftp://ftp.ensembl.org/pub/release-78/gtf/mus_musculus/Mus_musculus.GRCm38.78.gtf.gz
gzip -d Mus_musculus.GRCm38.78.gtf.gz
wget ftp://ftp.ensembl.org/pub/release-78/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz
gzip -d Mus_musculus.GRCm38.dna.primary_assembly.fa.gz
cd ..
cd ref
bsub -o bowtie2-build.o -e bowtie2-build.e \
-R'select[mem>10000] rusage[mem=10000]' -M10000 \
"/software/team31/bin/bowtie2-build Mus_musculus.GRCm38.dna.primary_assembly.fa Mus_musculus.GRCm38.dna.primary_assembly \
> bowtie2-build.log"
bsub -o faidx.o -e faidx.e samtools faidx Mus_musculus.GRCm38.dna.primary_assembly.fa
bsub -o CreateSequenceDictionary.o -e CreateSequenceDictionary.e \
-R'select[mem>4000] rusage[mem=4000]' -M4000 \
/software/java/bin/java -jar /software/team31/packages/picard-tools/CreateSequenceDictionary.jar \
R=Mus_musculus.GRCm38.dna.primary_assembly.fa \
O=Mus_musculus.GRCm38.dna.primary_assembly.fa.dict
cd ..
Check jobs ran OK:
echo 3 && \
grep -l 'Successfully completed' ref/*.o | wc -l && \
grep -l 'Exited' ref/*.o | wc -l
cd ref
bsub -o tophat.o -e tophat.e \
-R'select[mem>2000] rusage[mem=2000]' -M2000 \
/software/team31/packages/tophat-2.0.13/bin/tophat2 -G Mus_musculus.GRCm38.78.gtf \
-o Mus_musculus.GRCm38.78 --transcriptome-index Mus_musculus.GRCm38.78 Mus_musculus.GRCm38.dna.primary_assembly
cd ..
Check jobs ran OK:
echo 1 && \
grep -l 'Successfully completed' ref/tophat.o | wc -l && \
grep -l 'Exited' ref/tophat.o | wc -l
mkdir tophat
for sample in `ls fastq/*.1.fastq | sed -e 's/\.1.fastq//' | sed -e 's/^fastq\///'`
do
mkdir tophat/$sample
bsub \
-o tophat/$sample/tophat.o -e tophat/$sample/tophat.e \
-q long \
-R'select[mem>6000] rusage[mem=6000]' -M6000 \
"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/software/team31/packages/boost-1.56.0/lib \
/software/team31/packages/tophat-2.0.13/bin/tophat2 \
--output-dir tophat/$sample \
--GTF ref/Mus_musculus.GRCm38.78.gtf \
--library-type fr-firststrand \
--transcriptome-index ref/Mus_musculus.GRCm38.78/Mus_musculus.GRCm38.78 \
ref/Mus_musculus.GRCm38.dna.primary_assembly \
fastq/$sample.1.fastq \
fastq/$sample.2.fastq"
done
Check jobs ran OK:
ls fastq/*.1.fastq | wc -l && \
grep -l 'Successfully completed' tophat/*/tophat.o | wc -l && \
grep -l 'Exited' tophat/*/tophat.o | wc -l
rm fastq/*.fastq
ls tophat/*/accepted_hits.bam | sed -e 's/.bam$//' | xargs -ixxx \
bsub -o xxx.index.o -e xxx.index.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
/software/team31/bin/samtools index xxx.bam
Check jobs ran OK:
ls tophat/*/accepted_hits.bam | wc -l && \
grep -l 'Successfully completed' tophat/*/accepted_hits.index.o | wc -l && \
grep -l 'Exited' tophat/*/accepted_hits.index.o | wc -l
Mus_musculus.GRCm38.78.gtf has lines containing attributes like:
gene_name "FMN1;"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "FMN1;-201";
HTSeq assumes no semicolons in attribute values, so remove with:
sed -e 's/;";/";/' ref/Mus_musculus.GRCm38.78.gtf | sed -e 's/;-/-/' > ref/Mus_musculus.GRCm38.78.fixed.gtf
Also, remove retained intron transcripts:
grep -v retained_intron ref/Mus_musculus.GRCm38.78.fixed.gtf > ref/Mus_musculus.GRCm38.78.fixed.noretained.gtf
for base in `ls tophat/*/accepted_hits.bam | sed -e 's/\/accepted_hits.bam$//'`
do
bsub -o $base.htseq-count.o -e $base.htseq-count.e \
-n2 -R'span[hosts=1] select[mem>2000] rusage[mem=2000]' -M2000 \
"/software/team31/bin/samtools-0.1.19 sort -n -o $base/accepted_hits.bam $base.sorted \
| /software/team31/bin/samtools-0.1.19 view - \
| /software/team31/python/bin/htseq-count --stranded=reverse - ref/Mus_musculus.GRCm38.78.fixed.noretained.gtf \
> $base.count"
done
Check jobs ran OK:
ls tophat/*/accepted_hits.bam | wc -l && \
grep -l 'Successfully completed' tophat/*.htseq-count.o | wc -l && \
grep -l 'Exited' tophat/*.htseq-count.o | wc -l
(Ensure Ensembl version matches version mapped to.)
bsub -o annotation.o -e annotation.e \
-R'select[mem>500] rusage[mem=500]' -M500 \
"perl -I/software/team31/ensembl/branch-ensembl-78/ensembl/modules \
~is1/checkouts/bio-misc/get_ensembl_gene_annotation.pl \
--species Mus_musculus > annotation.txt"
Check job ran OK:
echo 1 && \
grep -l 'Successfully completed' annotation.o | wc -l && \
grep -l 'Exited' annotation.o | wc -l
mkdir deseq2
awk '{ print $1 }' `ls tophat/*.count | head -1` | grep ENS | sort \
> deseq2/counts.txt
cat /dev/null > deseq2/counts-header.txt
echo -e "\tcondition" > deseq2/samples.txt
for sample in `ls tophat/*.count | sed -e 's/tophat\///' | sed -e 's/.count$//'`
do
condition=`echo $sample | sed -e 's/[0-9]*$//'`
echo -ne "\t$sample" >> deseq2/counts-header.txt
join -j1 deseq2/counts.txt <(sort tophat/$sample.count) \
> deseq2/counts.tmp
mv deseq2/counts.tmp deseq2/counts.txt
echo -e "$sample\t$condition" >> deseq2/samples.txt
done
cat deseq2/counts-header.txt <(echo) deseq2/counts.txt \
| sed -e 's/ /\t/g' > deseq2/counts.tmp
rm deseq2/counts-header.txt
mv deseq2/counts.tmp deseq2/counts.txt
Alternatively, if you have multiple experiments, then you'll need to split the counts.txt and samples.txt files up. Firstly, set an environment variable representing the experiments. For example:
expts="Bap1 Ift140"
Then split the files into separate directories:
for expt in $expts
do
mkdir -p $expt/deseq2
awk '{ print $1 }' `ls tophat/*.count | head -1` | grep ENS | sort \
> $expt/deseq2/counts.txt
cat /dev/null > $expt/deseq2/counts-header.txt
echo -e "\tcondition" > $expt/deseq2/samples.txt
for sample in `ls tophat/*.count | sed -e 's/tophat\///' | sed -e 's/.count$//' | grep ^$expt`
do
condition=`echo $sample | sed -e 's/[0-9]*$//'`
echo -ne "\t$sample" >> $expt/deseq2/counts-header.txt
join -j1 $expt/deseq2/counts.txt <(sort tophat/$sample.count) \
> $expt/deseq2/counts.tmp
mv $expt/deseq2/counts.tmp $expt/deseq2/counts.txt
echo -e "$sample\t$condition" >> $expt/deseq2/samples.txt
done
cat $expt/deseq2/counts-header.txt <(echo) $expt/deseq2/counts.txt \
| sed -e 's/ /\t/g' > $expt/deseq2/counts.tmp
rm $expt/deseq2/counts-header.txt
mv $expt/deseq2/counts.tmp $expt/deseq2/counts.txt
done
Check that deseq2/samples.txt reflects your experiment correctly. The conditions may need editing because they are derived by simply removing numeric suffixes from the sample names. Also if you have a multifactor design then you may need to add more columns (and adjust the design below). If your conditions aren't "mutant" and "sibling" then you'll need to adjust the contrasts below.
If you have multiple experiments (see above), then cd into each experiment's directory before running R.
R-3.1.2
# Load libraries
library(DESeq2)
library(RColorBrewer)
library(gplots)
# Load data
countData <- read.table( "deseq2/counts.txt", header=TRUE, row.names=1 )
samples <- read.table( "deseq2/samples.txt", header=TRUE, row.names=1 )
# Check counts and samples match
print(data.frame(names(countData), rownames(samples)))
# If you have multiple factors then the design below will need to be changed
dds <- DESeqDataSetFromMatrix(countData, samples, design = ~ condition)
# Run DESeq2
dds <- DESeq(dds)
# Write out normalisation size factors
write.table(sizeFactors(dds), file="deseq2/size-factors.txt", col.names=FALSE, quote=FALSE, sep="\t")
# Write out normalised counts
write.table(counts(dds, normalized=TRUE), file="deseq2/normalised-counts.txt", col.names=FALSE, quote=FALSE, sep="\t")
# Write out results for specified pair of conditions
# This section should be repeated if you want to look at other pairs of conditions
res <- results(dds, contrast=c("condition", "mutant", "sibling"))
out <- data.frame(pvalue=res$pvalue, padj=res$padj, log2fc=res$log2FoldChange, row.names=rownames(res))
write.table(out, file="deseq2/sibling_vs_mutant.txt", col.names=FALSE, row.names=TRUE, quote=FALSE, sep="\t")
# Plot some QC graphs
pdf("deseq2/qc.pdf")
plotMA(dds)
rld <- rlogTransformation(dds, blind=TRUE)
select <- order(rowMeans(counts(dds, normalized=TRUE)), decreasing=TRUE)[1:30]
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
heatmap.2(assay(rld)[select,], col = hmcol, Rowv = FALSE, Colv = FALSE, scale="none", dendrogram="none", trace="none", margin=c(10, 6))
heatmap.2(as.matrix(dist(t(assay(rld)))), trace="none", col = rev(hmcol), margin=c(13, 13))
print(plotPCA(rld, intgroup=c("condition")))
plotDispEsts(dds)
dev.off()
# Exit R
quit()
If you have multiple experiments (see above), then cd into each experiment's directory first.
for file in `ls deseq2/*_vs_*.txt`
do
newfile=`echo $file | sed -e 's/deseq2\///' | sed -e 's/txt$/tsv/'`
newsigfile=`echo $file | sed -e 's/deseq2\///' | sed -e 's/txt$/sig.tsv/'`
echo -ne "Gene ID\tpval\tadjp\tlog2fc\tChr\tStart\tEnd\tStrand\tBiotype\tName\tDescription" > $newfile
echo -ne "Gene ID\tpval\tadjp\tlog2fc\tChr\tStart\tEnd\tStrand\tBiotype\tName\tDescription" > $newsigfile
for sample in `head -1 deseq2/counts.txt`
do
echo -ne "\t$sample count" >> $newfile
done
for sample in `head -1 deseq2/counts.txt`
do
echo -ne "\t$sample normalised count" >> $newfile
done
echo >> $newfile
join -j1 -t $'\t' $file annotation.txt \
| join -j1 -t $'\t' - deseq2/counts.txt \
| join -j1 -t $'\t' - deseq2/normalised-counts.txt \
| sort -gk3,3 -gk2,2 \
>> $newfile
awk '{ if ($2 != "NA" && $3 != "NA") print $0 }' $newfile > $newfile.tmp
awk '{ if ($2 != "NA" && $3 == "NA") print $0 }' $newfile >> $newfile.tmp
awk '{ if ($2 == "NA" && $3 == "NA") print $0 }' $newfile >> $newfile.tmp
mv $newfile.tmp $newfile
head -1 $newfile > $newsigfile
awk '{ if ($3 < 0.05) print $0 }' $newfile >> $newsigfile
done
If you have multiple experiments (see above), then cd into each experiment's directory first.
for file in `ls *.tsv | grep -v sig.tsv$`
do
dir=`echo $file | sed -e 's/tsv$/go/'`
bsub -o $dir/go.o -e $dir/go.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
perl -I/software/team31/packages/topgo-wrapper/lib \
/software/team31/packages/topgo-wrapper/script/run_topgo.pl \
--dir $dir \
--input_file $file \
--gene_field 1 \
--p_value_field 3 \
--go_terms_file /software/team31/packages/topgo-wrapper/data/mus_musculus_e78_go.txt \
--header 1
done
Check jobs ran OK:
ls *.tsv | grep -v sig.tsv$ | wc -l && \
grep -l 'Successfully completed' *.go/go.o | wc -l && \
grep -l 'Exited' *.go/go.o | wc -l
bsub -o ref/dexseq_prepare_annotation.o -e ref/dexseq_prepare_annotation.e \
-R'span[hosts=1] select[mem>2000] rusage[mem=2000]' -M2000 \
/software/bin/python2.7 /software/team31/R/DEXSeq/python_scripts/dexseq_prepare_annotation.py \
ref/Mus_musculus.GRCm38.78.fixed.noretained.gtf \
ref/Mus_musculus.GRCm38.78.fixed.noretained.dexseq.gff
Check jobs ran OK:
echo 1 && \
grep -l 'Successfully completed' ref/dexseq_prepare_annotation.o | wc -l && \
grep -l 'Exited' ref/dexseq_prepare_annotation.o | wc -l
for base in `ls tophat/*/accepted_hits.bam | sed -e 's/\/accepted_hits.bam$//'`
do
bsub -o $base.dexseq-count.o -e $base.dexseq-count.e \
-R'select[mem>2000] rusage[mem=2000]' -M2000 \
/software/bin/python2.7 /software/team31/R/DEXSeq/python_scripts/dexseq_count.py \
-f bam -p yes -s reverse -r pos \
ref/Mus_musculus.GRCm38.78.fixed.noretained.dexseq.gff \
$base/accepted_hits.bam \
$base.exon.count
done
Check jobs ran OK:
ls tophat/*/accepted_hits.bam | wc -l && \
grep -l 'Successfully completed' tophat/*.dexseq-count.o | wc -l && \
grep -l 'Exited' tophat/*.dexseq-count.o | wc -l
mkdir dexseq
cp deseq2/samples.txt dexseq
The commands below are for designs with a single factor called condition. If your conditions aren't "mutant" and "sibling" then you'll need to make adjustments.
If you have multiple experiments (see above), then cd into each experiment's directory before running R.
bsub -Is -n8 -q long \
-R'span[hosts=1] select[mem>28000] rusage[mem=28000]' -M28000 \
/software/R-3.1.2/bin/R
# Load libraries
library(DEXSeq)
# Parallelise
BPPARAM = MulticoreParam(workers=8)
# Load data
countFiles = list.files('tophat', pattern="exon.count$", full.names=TRUE)
samples <- read.table( "dexseq/samples.txt", header=TRUE, row.names=1 )
# If you have multiple factors then the design below will need to be changed
dxd = DEXSeqDataSetFromHTSeq(countFiles, sampleData=samples,
design= ~ sample + exon + condition:exon,
flattenedfile='ref/Mus_musculus.GRCm38.78.fixed.noretained.dexseq.gff')
# Run DEXSeq
# Normalise
dxd = estimateSizeFactors(dxd)
# Estimate dispersions
dxd = estimateDispersions(dxd, BPPARAM=BPPARAM)
# Test for differential exon usage
dxd = testForDEU(dxd, BPPARAM=BPPARAM)
# Estimate exon fold changes
dxd = estimateExonFoldChanges(dxd, fitExpToVar="condition", BPPARAM=BPPARAM)
# Get results
dxr = DEXSeqResults(dxd)
# Write out normalisation size factors
write.table(sizeFactors(dxd), file="dexseq/size-factors.txt", col.names=FALSE, quote=FALSE, sep="\t")
# Write out counts
write.table(featureCounts(dxd, normalized=FALSE), file="dexseq/counts.txt", col.names=FALSE, quote=FALSE, sep="\t")
# Write out normalised counts
write.table(featureCounts(dxd, normalized=TRUE), file="dexseq/normalised-counts.txt", col.names=FALSE, quote=FALSE, sep="\t")
# Write out results
out <- data.frame(pvalue=dxr$pvalue, padj=dxr$padj, log2fc=dxr$log2fold_mutant_sibling, row.names=rownames(dxr))
write.table(out, file="dexseq/sibling_vs_mutant.txt", col.names=FALSE, row.names=TRUE, quote=FALSE, sep="\t")
# Plot some QC graphs
pdf("dexseq/qc.pdf")
plotMA(dxr, cex=0.8)
plotDispEsts(dxd)
dev.off()
# Output HTML
DEXSeqHTML(dxr, FDR=0.05, color=c("#FF000080", "#0000FF80"), path="dexseq", file="output.html")
# Exit R
quit()