These notes assume you have multiple lanes of sequencing.
These notes assume you are using bash.
Make sure your ~/.softwarerc file contains (at a minimum):
team31
irods
sanger-samtools-refpath
Firstly, login to farm3-login and create a new directory in one of your Lustre directories and change into it. For example:
mkdir /lustre/scratch109/sanger/is1/cardio-rnaseq
cd /lustre/scratch109/sanger/is1/cardio-rnaseq
Secondly, set an environment variable representing the run/lanes. For example:
run_lanes="15475_1 15475_2"
Authenticate to iRODS:
kinit
Download CRAM files and their metadata:
for run_lane in $run_lanes
do
mkdir $run_lane
run=`echo $run_lane | sed -e 's/_.*//'`
lane=`echo $run_lane | sed -e 's/.*_//'`
/software/irods/icommands/bin/imeta qu -z seq -d id_run = $run and lane = $lane and target = 1 and type = cram \
| grep : | awk '{ print $2 }' | paste - - -d/ \
| xargs -ixxx /software/irods/icommands/bin/iget -K xxx $run_lane
chmod 664 $run_lane/*
find $run_lane | grep -E '#(168|888)\.' | xargs rm
for cram in $(find $run_lane | grep cram$ | sed -e 's/.*\///' | sed -e 's/\.cram$//')
do
/software/irods/icommands/bin/imeta ls -d /seq/$run/$cram.cram > $run_lane/$cram.imeta
done
done
ls */*.cram | sed -e 's/.cram$//' | xargs -ixxx \
bsub -o xxx.cram2bam.o -e xxx.cram2bam.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
"/software/team31/bin/samtools-1.2 view -b xxx.cram > xxx.bam"
Check jobs ran OK:
ls */*.cram | wc -l && \
grep -l 'Successfully completed' */*.cram2bam.o | wc -l && \
grep -l 'Exited' */*.cram2bam.o | wc -l
for run_lane in $run_lanes
do
rm $run_lane/*.cram
done
Get list of samples:
for run_lane in $run_lanes
do
grep -A1 sample$ $run_lane/*.imeta | grep value | sed -e 's/.imeta-value: /.bam:/' \
> $run_lane.samples.txt
done
Extract FASTQ:
for run_lane in $run_lanes
do
mkdir $run_lane.fastq
for line in `cat $run_lane.samples.txt`
do
bam=`echo $line | sed -e 's/:.*//'`
sample=`echo $line | sed -e 's/.*://'`
bsub -o $run_lane.fastq/$sample.o -e $run_lane.fastq/$sample.e \
-R'select[mem>500] rusage[mem=500]' -M500 \
/software/hpag/bin/bamtofastq \
F=$run_lane.fastq/$sample.1.fastq \
F2=$run_lane.fastq/$sample.2.fastq \
filename=$bam
done
done
Check jobs ran OK:
ls */*.bam | wc -l && \
grep -l 'Successfully completed' *.fastq/*.o | wc -l && \
grep -l 'Exited' *.fastq/*.o | wc -l
Confirm correct number of reads extracted:
echo -e "Sample\tCRAM\tFASTQ" && \
for run_lane in $run_lanes
do
for line in `grep -A1 -E 'total_reads$|sample$' $run_lane/*.imeta | grep imeta | paste - - | sort -V \
| awk '{ print $4 }' | paste - - | awk '{ print $1 ":" $2 }'`
do
sample=`echo $line | sed -e 's/:.*//'`
count1=`echo $line | sed -e 's/.*://'`
count2=$((`cat $run_lane.fastq/$sample.?.fastq | wc -l` / 4))
echo -e "$sample\t$count1\t$count2"
done
done
for run_lane in $run_lanes
do
rm $run_lane/*.bam
done
for run_lane in $run_lanes
do
bsub -n16 -o $run_lane.fastq/fastqc.o -e $run_lane.fastq/fastqc.e \
-R'span[hosts=1] select[mem>4000] rusage[mem=4000]' -M4000 \
perl /software/team31/packages/FastQC/fastqc --threads 16 $run_lane.fastq/*.fastq
done
Check jobs ran OK:
ls -d *.fastq | wc -l && \
grep -l 'Successfully completed' *.fastq/fastqc.o | wc -l && \
grep -l 'Exited' *.fastq/fastqc.o | wc -l
Check HTML files to ensure reads don't need trimming (of adapters or for quality).
mkdir fastq
for sample in `sed -e 's/.*://' *.samples.txt | sort -u`
do
for read in 1 2
do
bsub -o fastq/$sample.$read.merge.o -e fastq/$sample.$read.merge.e \
"cat `ls *.fastq/$sample.$read.fastq | sort | tr '\n' ' '` > fastq/$sample.$read.fastq"
done
done
Check jobs ran OK:
echo $((`sed -e 's/.*://' *.samples.txt | sort -u | wc -l` * 2)) && \
grep -l 'Successfully completed' fastq/*.o | wc -l && \
grep -l 'Exited' fastq/*.o | wc -l
for run_lane in $run_lanes
do
rm -rf $run_lane.fastq
done
mkdir ref
cd ref
wget ftp://ftp.ensembl.org/pub/release-78/gtf/homo_sapiens/Homo_sapiens.GRCh38.78.gtf.gz
gzip -d Homo_sapiens.GRCh38.78.gtf.gz
wget ftp://ftp.ensembl.org/pub/release-78/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
gzip -d Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
cd ..
cd ref
bsub -o bowtie2-build.o -e bowtie2-build.e \
-R'select[mem>10000] rusage[mem=10000]' -M10000 \
"/software/team31/bin/bowtie2-build Homo_sapiens.GRCh38.dna.primary_assembly.fa Homo_sapiens.GRCh38.dna.primary_assembly \
> bowtie2-build.log"
bsub -o faidx.o -e faidx.e samtools faidx Homo_sapiens.GRCh38.dna.primary_assembly.fa
bsub -o CreateSequenceDictionary.o -e CreateSequenceDictionary.e \
-R'select[mem>4000] rusage[mem=4000]' -M4000 \
/software/java/bin/java -jar /software/team31/packages/picard-tools/CreateSequenceDictionary.jar \
R=Homo_sapiens.GRCh38.dna.primary_assembly.fa \
O=Homo_sapiens.GRCh38.dna.primary_assembly.fa.dict
cd ..
Check jobs ran OK:
echo 3 && \
grep -l 'Successfully completed' ref/*.o | wc -l && \
grep -l 'Exited' ref/*.o | wc -l
cd ref
bsub -o tophat.o -e tophat.e \
-R'select[mem>2000] rusage[mem=2000]' -M2000 \
/software/team31/packages/tophat-2.0.13/bin/tophat2 -G Homo_sapiens.GRCh38.78.gtf \
-o Homo_sapiens.GRCh38.78 --transcriptome-index Homo_sapiens.GRCh38.78 Homo_sapiens.GRCh38.dna.primary_assembly
cd ..
Check jobs ran OK:
echo 1 && \
grep -l 'Successfully completed' ref/tophat.o | wc -l && \
grep -l 'Exited' ref/tophat.o | wc -l
mkdir tophat
for sample in `ls fastq/*.1.fastq | sed -e 's/\.1.fastq//' | sed -e 's/^fastq\///'`
do
mkdir tophat/$sample
bsub \
-o tophat/$sample/tophat.o -e tophat/$sample/tophat.e \
-q long \
-R'select[mem>6000] rusage[mem=6000]' -M6000 \
"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/software/team31/packages/boost-1.56.0/lib \
/software/team31/packages/tophat-2.0.13/bin/tophat2 \
--output-dir tophat/$sample \
--GTF ref/Homo_sapiens.GRCh38.78.gtf \
--library-type fr-firststrand \
--transcriptome-index ref/Homo_sapiens.GRCh38.78/Homo_sapiens.GRCh38.78 \
ref/Homo_sapiens.GRCh38.dna.primary_assembly \
fastq/$sample.1.fastq \
fastq/$sample.2.fastq"
done
Check jobs ran OK:
ls fastq/*.1.fastq | wc -l && \
grep -l 'Successfully completed' tophat/*/tophat.o | wc -l && \
grep -l 'Exited' tophat/*/tophat.o | wc -l
rm fastq/*.fastq
ls tophat/*/accepted_hits.bam | sed -e 's/.bam$//' | xargs -ixxx \
bsub -o xxx.index.o -e xxx.index.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
/software/team31/bin/samtools index xxx.bam
Check jobs ran OK:
ls tophat/*/accepted_hits.bam | wc -l && \
grep -l 'Successfully completed' tophat/*/accepted_hits.index.o | wc -l && \
grep -l 'Exited' tophat/*/accepted_hits.index.o | wc -l
Homo_sapiens.GRCh38.78.gtf has lines containing attributes like:
gene_name "PRAMEF6;"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "PRAMEF6;-201";
HTSeq assumes no semicolons in attribute values, so remove with:
sed -e 's/;";/";/' ref/Homo_sapiens.GRCh38.78.gtf | sed -e 's/;-/-/' > ref/Homo_sapiens.GRCh38.78.fixed.gtf
Also, remove retained intron transcripts:
grep -v retained_intron ref/Homo_sapiens.GRCh38.78.fixed.gtf > ref/Homo_sapiens.GRCh38.78.fixed.noretained.gtf
for base in `ls tophat/*/accepted_hits.bam | sed -e 's/\/accepted_hits.bam$//'`
do
bsub -o $base.htseq-count.o -e $base.htseq-count.e \
-n2 -R'span[hosts=1] select[mem>2000] rusage[mem=2000]' -M2000 \
"/software/team31/bin/samtools-0.1.19 sort -n -o $base/accepted_hits.bam $base.sorted \
| /software/team31/bin/samtools-0.1.19 view - \
| /software/team31/python/bin/htseq-count --stranded=reverse - ref/Homo_sapiens.GRCh38.78.fixed.noretained.gtf \
> $base.count"
done
Check jobs ran OK:
ls tophat/*/accepted_hits.bam | wc -l && \
grep -l 'Successfully completed' tophat/*.htseq-count.o | wc -l && \
grep -l 'Exited' tophat/*.htseq-count.o | wc -l
(Ensure Ensembl version matches version mapped to.)
bsub -o annotation.o -e annotation.e \
-R'select[mem>500] rusage[mem=500]' -M500 \
"perl -I/software/team31/ensembl/branch-ensembl-78/ensembl/modules \
~is1/checkouts/bio-misc/get_ensembl_gene_annotation.pl \
--species Homo_sapiens > annotation.txt"
Check job ran OK:
echo 1 && \
grep -l 'Successfully completed' annotation.o | wc -l && \
grep -l 'Exited' annotation.o | wc -l
mkdir deseq2
awk '{ print $1 }' `ls tophat/*.count | head -1` | grep ENS | sort \
> deseq2/counts.txt
cat /dev/null > deseq2/counts-header.txt
echo -e "\tcondition" > deseq2/samples.txt
for sample in `ls tophat/*.count | sed -e 's/tophat\///' | sed -e 's/.count$//'`
do
condition=`echo $sample | sed -e 's/[0-9]*$//'`
echo -ne "\t$sample" >> deseq2/counts-header.txt
join -j1 deseq2/counts.txt <(sort tophat/$sample.count) \
> deseq2/counts.tmp
mv deseq2/counts.tmp deseq2/counts.txt
echo -e "$sample\t$condition" >> deseq2/samples.txt
done
cat deseq2/counts-header.txt <(echo) deseq2/counts.txt \
| sed -e 's/ /\t/g' > deseq2/counts.tmp
rm deseq2/counts-header.txt
mv deseq2/counts.tmp deseq2/counts.txt
Alternatively, if you have multiple experiments, then you'll need to split the counts.txt and samples.txt files up. Firstly, set an environment variable representing the experiments. For example:
expts="Bap1 Ift140"
Then split the files into separate directories:
for expt in $expts
do
mkdir -p $expt/deseq2
awk '{ print $1 }' `ls tophat/*.count | head -1` | grep ENS | sort \
> $expt/deseq2/counts.txt
cat /dev/null > $expt/deseq2/counts-header.txt
echo -e "\tcondition" > $expt/deseq2/samples.txt
for sample in `ls tophat/*.count | sed -e 's/tophat\///' | sed -e 's/.count$//' | grep ^$expt`
do
condition=`echo $sample | sed -e 's/[0-9]*$//'`
echo -ne "\t$sample" >> $expt/deseq2/counts-header.txt
join -j1 $expt/deseq2/counts.txt <(sort tophat/$sample.count) \
> $expt/deseq2/counts.tmp
mv $expt/deseq2/counts.tmp $expt/deseq2/counts.txt
echo -e "$sample\t$condition" >> $expt/deseq2/samples.txt
done
cat $expt/deseq2/counts-header.txt <(echo) $expt/deseq2/counts.txt \
| sed -e 's/ /\t/g' > $expt/deseq2/counts.tmp
rm $expt/deseq2/counts-header.txt
mv $expt/deseq2/counts.tmp $expt/deseq2/counts.txt
done
Check that deseq2/samples.txt reflects your experiment correctly. The conditions may need editing because they are derived by simply removing numeric suffixes from the sample names. Also if you have a multifactor design then you may need to add more columns (and adjust the design below). If your conditions aren't "mutant" and "sibling" then you'll need to adjust the contrasts below.
If you have multiple experiments (see above), then cd into each experiment's directory before running R.
R-3.1.2
# Load libraries
library(DESeq2)
library(RColorBrewer)
library(gplots)
# Load data
countData <- read.table( "deseq2/counts.txt", header=TRUE, row.names=1 )
samples <- read.table( "deseq2/samples.txt", header=TRUE, row.names=1 )
# Check counts and samples match
print(data.frame(names(countData), rownames(samples)))
# If you have multiple factors then the design below will need to be changed
dds <- DESeqDataSetFromMatrix(countData, samples, design = ~ condition)
# Run DESeq2
dds <- DESeq(dds)
# Write out normalisation size factors
write.table(sizeFactors(dds), file="deseq2/size-factors.txt", col.names=FALSE, quote=FALSE, sep="\t")
# Write out normalised counts
write.table(counts(dds, normalized=TRUE), file="deseq2/normalised-counts.txt", col.names=FALSE, quote=FALSE, sep="\t")
# Write out results for specified pair of conditions
# This section should be repeated if you want to look at other pairs of conditions
res <- results(dds, contrast=c("condition", "beating_stage_4_1hyp", "beating_stage_4_norm"))
out <- data.frame(pvalue=res$pvalue, padj=res$padj, log2fc=res$log2FoldChange, row.names=rownames(res))
write.table(out, file="deseq2/beating_stage_4_norm_vs_beating_stage_4_1hyp.txt", col.names=FALSE, row.names=TRUE, quote=FALSE, sep="\t")
# Plot some QC graphs
pdf("deseq2/qc.pdf")
plotMA(dds)
rld <- rlogTransformation(dds, blind=TRUE)
select <- order(rowMeans(counts(dds, normalized=TRUE)), decreasing=TRUE)[1:30]
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
heatmap.2(assay(rld)[select,], col = hmcol, Rowv = FALSE, Colv = FALSE, scale="none", dendrogram="none", trace="none", margin=c(10, 6))
heatmap.2(as.matrix(dist(t(assay(rld)))), trace="none", col = rev(hmcol), margin=c(13, 13))
print(plotPCA(rld, intgroup=c("condition")))
plotDispEsts(dds)
dev.off()
# Exit R
quit()
If you have multiple experiments (see above), then cd into each experiment's directory first.
for file in `ls deseq2/*_vs_*.txt`
do
newfile=`echo $file | sed -e 's/deseq2\///' | sed -e 's/txt$/tsv/'`
newsigfile=`echo $file | sed -e 's/deseq2\///' | sed -e 's/txt$/sig.tsv/'`
echo -ne "Gene ID\tpval\tadjp\tlog2fc\tChr\tStart\tEnd\tStrand\tBiotype\tName\tDescription" > $newfile
echo -ne "Gene ID\tpval\tadjp\tlog2fc\tChr\tStart\tEnd\tStrand\tBiotype\tName\tDescription" > $newsigfile
for sample in `head -1 deseq2/counts.txt`
do
echo -ne "\t$sample count" >> $newfile
done
for sample in `head -1 deseq2/counts.txt`
do
echo -ne "\t$sample normalised count" >> $newfile
done
echo >> $newfile
join -j1 -t $'\t' $file annotation.txt \
| join -j1 -t $'\t' - deseq2/counts.txt \
| join -j1 -t $'\t' - deseq2/normalised-counts.txt \
| sort -gk3,3 -gk2,2 \
>> $newfile
awk '{ if ($2 != "NA" && $3 != "NA") print $0 }' $newfile > $newfile.tmp
awk '{ if ($2 != "NA" && $3 == "NA") print $0 }' $newfile >> $newfile.tmp
awk '{ if ($2 == "NA" && $3 == "NA") print $0 }' $newfile >> $newfile.tmp
mv $newfile.tmp $newfile
head -1 $newfile > $newsigfile
awk '{ if ($3 < 0.05) print $0 }' $newfile >> $newsigfile
done
If you have multiple experiments (see above), then cd into each experiment's directory first.
for file in `ls *.tsv | grep -v sig.tsv$`
do
dir=`echo $file | sed -e 's/tsv$/go/'`
bsub -o $dir/go.o -e $dir/go.e \
-R'select[mem>2000] rusage[mem=2000]' -M2000 \
perl -I/software/team31/packages/topgo-wrapper/lib \
/software/team31/packages/topgo-wrapper/script/run_topgo.pl \
--dir $dir \
--input_file $file \
--gene_field 1 \
--p_value_field 3 \
--go_terms_file /software/team31/packages/topgo-wrapper/data/homo_sapiens_e78_go.txt \
--header 1
done
Check jobs ran OK:
ls *.tsv | grep -v sig.tsv$ | wc -l && \
grep -l 'Successfully completed' *.go/go.o | wc -l && \
grep -l 'Exited' *.go/go.o | wc -l