iansealy/rnaseq-intergenic.md

## rnaseq-intergenic.md

      
    Raw
  

              rnaseq-intergenic.md
            
          
    RNA-Seq (including intergenic assembly)

These notes assume you are using bash.
Firstly, login to farm3-login and create a new directory in one of your Lustre directories and change into it. For example:
mkdir /lustre/scratch110/sanger/is1/kdm2aa
cd /lustre/scratch110/sanger/is1/kdm2aa

Secondly, set an environment variable representing the run/lanes. For example:
run_lanes="14260_1 14260_2 14262_1 14262_2"

Get CRAM files from iRODS

Authenticate to iRODS:
kinit

Download CRAM files and their metadata:
for run_lane in $run_lanes
do
mkdir $run_lane
run=`echo $run_lane | sed -e 's/_.*//'`
lane=`echo $run_lane | sed -e 's/.*_//'`
/software/irods/icommands/bin/imeta qu -z seq -d id_run = $run and lane = $lane and target = 1 and type = cram \
| grep : | awk '{ print $2 }' | paste - - -d/ \
| xargs -ixxx /software/irods/icommands/bin/iget -K xxx $run_lane
chmod 664 $run_lane/*
find $run_lane | grep -E '#(168|888)\.' | xargs rm
for cram in $(find $run_lane | grep cram$ | sed -e 's/.*\///' | sed -e 's/\.cram$//')
do
/software/irods/icommands/bin/imeta ls -d /seq/$run/$cram.cram > $run_lane/$cram.imeta
done
done

Convert CRAM files to BAM

ls */*.cram | sed -e 's/.cram$//' | xargs -ixxx \
bsub -o xxx.cram2bam.o -e xxx.cram2bam.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
"/software/team31/bin/samtools-1.2 view -b xxx.cram > xxx.bam"

Check jobs ran OK:
ls */*.cram | wc -l && \
grep -l 'Successfully completed' */*.cram2bam.o | wc -l && \
grep -l 'Exited' */*.cram2bam.o | wc -l

Delete original CRAM files

for run_lane in $run_lanes
do
rm $run_lane/*.cram
done

Convert BAM to FASTQ

Get list of samples:
for run_lane in $run_lanes
do
grep -A1 sample$ $run_lane/*.imeta | grep value | sed -e 's/.imeta-value: /.bam:/' \
> $run_lane.samples.txt
done

Extract FASTQ:
for run_lane in $run_lanes
do
mkdir $run_lane.fastq
for line in `cat $run_lane.samples.txt`
do
bam=`echo $line | sed -e 's/:.*//'`
sample=`echo $line | sed -e 's/.*://'`
bsub -o $run_lane.fastq/$sample.o -e $run_lane.fastq/$sample.e \
-R'select[mem>500] rusage[mem=500]' -M500 \
/software/hpag/bin/bamtofastq \
F=$run_lane.fastq/$sample.1.fastq \
F2=$run_lane.fastq/$sample.2.fastq \
filename=$bam
done
done

Check jobs ran OK:
ls */*.bam | wc -l && \
grep -l 'Successfully completed' *.fastq/*.o | wc -l && \
grep -l 'Exited' *.fastq/*.o | wc -l

Confirm correct number of reads extracted:
echo -e "Sample\tCRAM\tFASTQ" && \
for run_lane in $run_lanes
do
for line in `grep -A1 -E 'total_reads|sample$' $run_lane/*.imeta | grep imeta | paste - - | sort -V \
| awk '{ print $4 }' | paste - - | awk '{ print $1 ":" $2 }'`
do
sample=`echo $line | sed -e 's/:.*//'`
count1=`echo $line | sed -e 's/.*://'`
count2=$((`cat $run_lane.fastq/$sample.?.fastq | wc -l` / 4))
echo -e "$sample\t$count1\t$count2"
done
done

Delete intermediate BAM files

for run_lane in $run_lanes
do
rm $run_lane/*.bam
done

Run FastQC on original FASTQ files

for run_lane in $run_lanes
do
bsub -n16 -o $run_lane.fastq/fastqc.o -e $run_lane.fastq/fastqc.e \
-R'span[hosts=1] select[mem>4000] rusage[mem=4000]' -M4000 \
perl /software/team31/packages/FastQC/fastqc --threads 16 $run_lane.fastq/*.fastq
done

Check jobs ran OK:
ls -d *.fastq | wc -l && \
grep -l 'Successfully completed' *.fastq/fastqc.o | wc -l && \
grep -l 'Exited' *.fastq/fastqc.o | wc -l

Check HTML files to ensure reads don't need trimming (of adapters or for quality).
Merge FASTQ files across lanes

mkdir fastq
for sample in `sed -e 's/.*://' *.samples.txt | sort -u`
do
for read in 1 2
do
bsub -o fastq/$sample.$read.merge.o -e fastq/$sample.$read.merge.e \
"cat `ls *.fastq/$sample.$read.fastq | sort | tr '\n' ' '` > fastq/$sample.$read.fastq"
done
done

Check jobs ran OK:
echo $((`sed -e 's/.*://' *.samples.txt | sort -u | wc -l` * 2)) && \
grep -l 'Successfully completed' fastq/*.o | wc -l && \
grep -l 'Exited' fastq/*.o | wc -l

Delete intermediate FASTQ files

for run_lane in $run_lanes
do
rm -rf $run_lane.fastq
done

Download reference genome and transcriptome

mkdir ref
cd ref
wget ftp://ftp.ensembl.org/pub/release-79/gtf/danio_rerio/Danio_rerio.Zv9.79.gtf.gz
gzip -d Danio_rerio.Zv9.79.gtf.gz
wget ftp://ftp.ensembl.org/pub/release-79/fasta/danio_rerio/dna/Danio_rerio.Zv9.dna_sm.toplevel.fa.gz
gzip -d Danio_rerio.Zv9.dna_sm.toplevel.fa.gz
cd ..

Index reference genome

cd ref
bsub -o bowtie2-build.o -e bowtie2-build.e \
-R'select[mem>10000] rusage[mem=10000]' -M10000 \
"/software/team31/bin/bowtie2-build Danio_rerio.Zv9.dna_sm.toplevel.fa Danio_rerio.Zv9.dna_sm.toplevel \
> bowtie2-build.log"
bsub -o faidx.o -e faidx.e samtools faidx Danio_rerio.Zv9.dna_sm.toplevel.fa
bsub -o CreateSequenceDictionary.o -e CreateSequenceDictionary.e \
-R'select[mem>4000] rusage[mem=4000]' -M4000 \
/software/java/bin/java -jar /software/team31/packages/picard-tools/CreateSequenceDictionary.jar \
R=Danio_rerio.Zv9.dna_sm.toplevel.fa \
O=Danio_rerio.Zv9.dna_sm.toplevel.fa.dict
cd ..

Check jobs ran OK:
echo 3 && \
grep -l 'Successfully completed' ref/*.o | wc -l && \
grep -l 'Exited' ref/*.o | wc -l

Align to genome using TopHat

mkdir tophat
for sample in `ls fastq/*.1.fastq | sed -e 's/\.1.fastq//' | sed -e 's/^fastq\///'`
do
mkdir tophat/$sample
bsub \
-o tophat/$sample/tophat.o -e tophat/$sample/tophat.e \
-q long \
-R'select[mem>6000] rusage[mem=6000]' -M6000 \
"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/software/team31/packages/boost-1.56.0/lib \
/software/team31/packages/tophat-2.0.13/bin/tophat2 \
--output-dir tophat/$sample \
--library-type fr-firststrand \
ref/Danio_rerio.Zv9.dna_sm.toplevel \
fastq/$sample.1.fastq \
fastq/$sample.2.fastq"
done

Check jobs ran OK:
ls fastq/*.1.fastq | wc -l && \
grep -l 'Successfully completed' tophat/*/tophat.o | wc -l && \
grep -l 'Exited' tophat/*/tophat.o | wc -l

Delete FASTQ files:

rm fastq/*.fastq

Index TopHat BAM files

ls tophat/*/accepted_hits.bam | sed -e 's/.bam$//' | xargs -ixxx \
bsub -o xxx.index.o -e xxx.index.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
/software/team31/bin/samtools index xxx.bam

Check jobs ran OK:
ls tophat/*/accepted_hits.bam | wc -l && \
grep -l 'Successfully completed' tophat/*/accepted_hits.index.o | wc -l && \
grep -l 'Exited' tophat/*/accepted_hits.index.o | wc -l

Assemble transcripts using Cufflinks

mkdir cufflinks
for sample in `ls tophat/*/accepted_hits.bam | sed -e 's/\/accepted_hits.bam//' | sed -e 's/tophat\///'`
do
mkdir -p cufflinks/$sample
bsub -o cufflinks/$sample/cufflinks.o -e cufflinks/$sample/cufflinks.e \
-R'select[mem>4000] rusage[mem=4000]' -M4000 \
"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/software/team31/packages/boost-1.53.0/lib \
/software/team31/packages/cufflinks-2.2.1/bin/cufflinks \
--output-dir cufflinks/$sample \
--library-type fr-firststrand \
tophat/$sample/accepted_hits.bam"
done

Check jobs ran OK:
ls tophat/*/accepted_hits.bam | wc -l && \
grep -l 'Successfully completed' cufflinks/*/cufflinks.o | wc -l && \
grep -l 'Exited' cufflinks/*/cufflinks.o | wc -l

Merge assemblies using Cuffmerge

mkdir cufflinks/cuffmerge
ls cufflinks/*/transcripts.gtf | sort > cufflinks/cuffmerge_assembly_list.txt
bsub -o cufflinks/cuffmerge.o -e cufflinks/cuffmerge.e \
-R'select[mem>4000] rusage[mem=4000]' -M4000 \
"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/software/team31/packages/boost-1.53.0/lib \
/software/team31/packages/cufflinks-2.2.1/bin/cuffmerge \
-o cufflinks/cuffmerge \
-g ref/Danio_rerio.Zv9.79.gtf \
-s ref/Danio_rerio.Zv9.dna_sm.toplevel.fa \
cufflinks/cuffmerge_assembly_list.txt"

Check jobs ran OK:
echo 1 && \
grep -l 'Successfully completed' cufflinks/cuffmerge.o | wc -l && \
grep -l 'Exited' cufflinks/cuffmerge.o | wc -l

Identify stranded unannotated genes

grep 'class_code "u"' cufflinks/cuffmerge/merged.gtf | awk '{ if ($7 != ".") print $0 }' \
> cufflinks/cuffmerge/novel.gtf

Merge novel exons with annotated genes and remove retained intron transcripts:
cat ref/Danio_rerio.Zv9.79.gtf cufflinks/cuffmerge/novel.gtf \
| grep -v retained_intron \
> cufflinks/cuffmerge/Danio_rerio.Zv9.79.novel.noretained.gtf

Count reads with HTSeq for TopHat BAM files

for base in `ls tophat/*/accepted_hits.bam | sed -e 's/\/accepted_hits.bam$//'`
do
bsub -o $base.htseq-count.o -e $base.htseq-count.e \
-n2 -R'span[hosts=1] select[mem>2000] rusage[mem=2000]' -M2000 \
"/software/team31/bin/samtools-0.1.19 sort -n -o $base/accepted_hits.bam $base.sorted \
| /software/team31/bin/samtools-0.1.19 view - \
| /software/team31/python/bin/htseq-count --stranded=reverse - \
cufflinks/cuffmerge/Danio_rerio.Zv9.79.novel.noretained.gtf \
> $base.count"
done

Check jobs ran OK:
ls tophat/*/accepted_hits.bam | wc -l && \
grep -l 'Successfully completed' tophat/*.htseq-count.o | wc -l && \
grep -l 'Exited' tophat/*.htseq-count.o | wc -l

Get annotation for each Ensembl gene

(Ensure Ensembl version matches version mapped to.)
bsub -o annotation.o -e annotation.e \
-R'select[mem>500] rusage[mem=500]' -M500 \
"perl -I/software/team31/ensembl/branch-ensembl-79/ensembl/modules \
~is1/checkouts/bio-misc/get_ensembl_gene_annotation.pl \
--species Danio_rerio > annotation.txt"

Check job ran OK:
echo 1 && \
grep -l 'Successfully completed' annotation.o | wc -l && \
grep -l 'Exited' annotation.o | wc -l

Add annotation for novel genes

perl ~is1/checkouts/bio-misc/make_rnaseq_annotation_from_cuffmerge_gtf.pl \
< cufflinks/cuffmerge/novel.gtf >> annotation.txt

Rename count files for any samples that failed QC:

For example:
for sample in zmp_ph97_H4 zmp_ph108_B9; do
mv tophat/$sample.count tophat/$sample.count.qcfail
done

Make samples file:

Ideally, make samples file in automated way. For example:
mkdir deseq2
echo -e "\tallele\tstage\tcondition" > deseq2/samples.txt
mysql -h seqw-db -P 3379 -u warehouse_ro sequencescape_warehouse -Bse "SELECT description, name FROM current_samples WHERE name LIKE 'zmp_ph%'" \
| sed -e 's/wild type/2:wt/' | sed -e 's/homozygous mutant/0:hom/' | sed -e 's/heterozygous/1:het/' | sed -e 's/,/ /g' | grep ^mRNA \
| awk '{ print $13 "\tday" $5 "\t" $8 "\t" $NF }' | sort -V \
| sed -e 's/[0-2]://' | awk '{ print $4 "\t" $1 "\t" $2 "\t" $3 }' \
| sed -e 's/hom/mut/' | sed -e 's/wt/sib/' | sed -e 's/\het/sib/' \
| while read line
do
sample=`echo $line | sed -e 's/ .*//'`
if [ -f tophat/$sample.count ]
then
  echo "$line"
fi
done >> deseq2/samples.txt

Otherwise:
mkdir deseq2
echo -e "\tcondition" > deseq2/samples.txt
for sample in `ls tophat/*.count | sed -e 's/tophat\///' | sed -e 's/.count$//' | sort -V`
do
condition=`echo $sample | sed -e 's/_[^_]*$//'`
echo -e "$sample\t$condition"
done | while read line
do
sample=`echo $line | sed -e 's/ .*//'`
if [ -f tophat/$sample.count ]
then
  echo "$line"
fi
done >> deseq2/samples.txt

Then check that deseq2/samples.txt reflects your experiment correctly. The conditions may need editing because they are derived by simply removing suffixes from the sample names. Also if you have a multifactor design then you may need to add more columns (and adjust the design below).
Merge count data

awk '{ print $1 }' `ls tophat/*.count | head -1` | grep -E 'ENS|XLOC' | sort \
> deseq2/counts.txt
cat /dev/null > deseq2/counts-header.txt
for sample in `awk -F"\t" '{ print $1 }' deseq2/samples.txt | grep -v '^$'`
do
echo -ne "\t$sample" >> deseq2/counts-header.txt
join -j1 deseq2/counts.txt <(sort tophat/$sample.count) \
> deseq2/counts.tmp
mv deseq2/counts.tmp deseq2/counts.txt
done
cat deseq2/counts-header.txt <(echo) deseq2/counts.txt \
| sed -e 's/ /\t/g' > deseq2/counts.tmp
rm deseq2/counts-header.txt
mv deseq2/counts.tmp deseq2/counts.txt

Run DESeq2

If your conditions aren't "mut" and "sib" then you'll need to adjust the contrasts below.
R-3.1.2

# Load libraries
library(DESeq2)
library(RColorBrewer)
library(gplots)

# Load data
countData <- read.table( "deseq2/counts.txt", header=TRUE, row.names=1 )
samples <- read.table( "deseq2/samples.txt", header=TRUE, row.names=1 )

# Check counts and samples match
print(data.frame(names(countData), rownames(samples)))

# If you have multiple factors then the design below will need to be changed, for example:
# dds <- DESeqDataSetFromMatrix(countData, samples, design = ~ stage + allele + condition)
dds <- DESeqDataSetFromMatrix(countData, samples, design = ~ condition)

# Run DESeq2
dds <- DESeq(dds)

# Write out normalisation size factors
write.table(sizeFactors(dds), file="deseq2/size-factors.txt", col.names=FALSE, quote=FALSE, sep="\t")

# Write out normalised counts
write.table(counts(dds, normalized=TRUE), file="deseq2/normalised-counts.txt", col.names=FALSE, quote=FALSE, sep="\t")

# Write out results for specified pair of conditions
# This section should be repeated if you want to look at other pairs of conditions
res <- results(dds, contrast=c("condition", "mut", "sib"))
out <- data.frame(pvalue=res$pvalue, padj=res$padj, log2fc=res$log2FoldChange, row.names=rownames(res))
write.table(out, file="deseq2/sib_vs_mut.txt", col.names=FALSE, row.names=TRUE, quote=FALSE, sep="\t")

# Plot some QC graphs
pdf("deseq2/qc.pdf")
plotMA(dds)
rld <- rlogTransformation(dds, blind=TRUE)
select <- order(rowMeans(counts(dds, normalized=TRUE)), decreasing=TRUE)[1:30]
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
heatmap.2(assay(rld)[select,], col = hmcol, Rowv = FALSE, Colv = FALSE, scale="none", dendrogram="none", trace="none", margin=c(10, 6))
heatmap.2(as.matrix(dist(t(assay(rld)))), trace="none", col = rev(hmcol), margin=c(13, 13))
print(plotPCA(rld, intgroup=c("condition")))
plotDispEsts(dds)
dev.off()

# Exit R
quit()

Merge results and annotation

for file in `ls deseq2/*_vs_*.txt`
do
newfile=`echo $file | sed -e 's/deseq2\///' | sed -e 's/txt$/tsv/'`
newsigfile=`echo $file | sed -e 's/deseq2\///' | sed -e 's/txt$/sig.tsv/'`
echo -ne "Gene ID\tpval\tadjp\tlog2fc\tChr\tStart\tEnd\tStrand\tBiotype\tName\tDescription" > $newfile
echo -ne "Gene ID\tpval\tadjp\tlog2fc\tChr\tStart\tEnd\tStrand\tBiotype\tName\tDescription" > $newsigfile
for sample in `head -1 deseq2/counts.txt`
do
echo -ne "\t$sample count" >> $newfile
done
for sample in `head -1 deseq2/counts.txt`
do
echo -ne "\t$sample normalised count" >> $newfile
done
echo >> $newfile
join -j1 -t $'\t' $file annotation.txt \
| join -j1 -t $'\t' - deseq2/counts.txt \
| join -j1 -t $'\t' - deseq2/normalised-counts.txt \
| sort -gk3,3 -gk2,2 \
>> $newfile
awk '{ if ($2 != "NA" && $3 != "NA") print $0 }' $newfile > $newfile.tmp
awk '{ if ($2 != "NA" && $3 == "NA") print $0 }' $newfile >> $newfile.tmp
awk '{ if ($2 == "NA" && $3 == "NA") print $0 }' $newfile >> $newfile.tmp
mv $newfile.tmp $newfile
head -1 $newfile > $newsigfile
awk '{ if ($3 < 0.05) print $0 }' $newfile >> $newsigfile
done

Add read groups to BAM files

for sample in `ls tophat/*/accepted_hits.bam | sed -e 's/\/accepted_hits.bam$//' | sed -e 's/tophat\///'`
do
mkdir -p merge/$sample
bsub -o merge/$sample.readgroup.o -e merge/$sample.readgroup.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
java -XX:ParallelGCThreads=1 -Xmx2g -jar /software/team31/packages/picard-tools/AddOrReplaceReadGroups.jar \
INPUT=tophat/$sample/accepted_hits.bam \
OUTPUT=merge/$sample.bam \
RGLB=$sample RGPL=ILLUMINA RGPU=$sample RGSM=$sample \
VALIDATION_STRINGENCY=SILENT VERBOSITY=WARNING QUIET=true \
TMP_DIR=merge/$sample
done

Check jobs ran OK:
ls tophat/*/accepted_hits.bam | wc -l && \
grep -l 'Successfully completed' merge/*.readgroup.o | wc -l && \
grep -l 'Exited' merge/*.readgroup.o | wc -l

Merge BAM files

bsub -o merge/merge.o -e merge/merge.e \
-R'select[mem>4000] rusage[mem=4000]' -M4000 \
java -XX:ParallelGCThreads=1 -Xmx4g -jar /software/team31/packages/picard-tools/MergeSamFiles.jar \
INPUT=`ls merge/*.bam | sort -V | tr '\n' ' ' | sed -e 's/ $//' | sed -e 's/ / INPUT=/g'` \
OUTPUT=merge/merged.bam \
MSD=true ASSUME_SORTED=false \
VALIDATION_STRINGENCY=SILENT VERBOSITY=WARNING QUIET=true \
TMP_DIR=merge

Check jobs ran OK:
echo 1 && \
grep -l 'Successfully completed' merge/merge.o | wc -l && \
grep -l 'Exited' merge/merge.o | wc -l

Index merged BAM file

bsub -o merge/index.o -e merge/index.e \
-R'select[mem>1000] rusage[mem=1000]' -M1000 \
/software/team31/bin/samtools index merge/merged.bam

Check jobs ran OK:
echo 1 && \
grep -l 'Successfully completed' merge/index.o | wc -l && \
grep -l 'Exited' merge/index.o | wc -l

Delete unmerged files

mv merge/merged.bam* .
rm -rf merge