coralzhang/CodeAvailability1.sh

## CodeAvailability1.R
rm(list=ls())
setwd("/path/to/directory")
suppressMessages(library(DESeq2))
packageVersion("DESeq2")

### DESeq 2 ###
dat1 = read.table("outtxtnamefromfeatureCounts.txt",header=TRUE)
outname = "fc-pval-wgtf.csv"
dim(dat1)
dat = floor(dat1[,c(7:12)])
cond.a = factor(rep(c("PR","WR"), each=3),levels=c("WR","PR")) ### Need to check the order of files
coldata = data.frame(cond.a=cond.a)
dds <- DESeqDataSetFromMatrix(dat, DataFrame(cond.a), ~ cond.a)
dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
deseq.obj = DESeq(dds)
res = results(deseq.obj)
logfc = (res$log2FoldChange)
pval = res$padj
datout=cbind(logfc,pval)
sampleDists <- dist(t(assay(rld)))

#### MA plot from DESeq2 here

pdf("MA.pdf",height=4,width=4)
plotMA(res, ylim=c(-2,2),main="MA plot")
dev.off()

pdf("heatmap.pdf",height=4,width=4)
library("RColorBrewer")
library("pheatmap")
rld <- rlog(dds, blind=FALSE)
sampleDists <- dist(t(assay(rld)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(rld$cond.a,rep(1:3,2))
colnames(sampleDistMatrix) <- paste(rld$cond.a,rep(1:3,2))
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors)
dev.off()

pdf("PCA.pdf",height=4,width=4)
plotPCA(rld, intgroup=c("cond.a"))
dev.off()

########### RNA Seq & qRT-PCR correlation
fc = 2^logfc
gene.id1=c("ENSDARG00000044199","ENSDARG00000042529","ENSDARG00000044861","ENSDARG00000097008","ENSDARG00000017274","ENSDARG00000045677","ENSDARG00000002193")
gene.id = paste0("gene:",gene.id1)
gene.id.ind = vector(length=length(gene.id))
for (i in 1:length(gene.id)){
  gene.id.ind[i]=which(as.character(dat1[,1])==gene.id[i])
}

rnafc = fc[gene.id.ind]
gname.fai= c("gnat1", "rho", "gnat2", "uv", "blue", "green", "red")
qPCR.data.fai <- c(0.694142669, 0.659446831, 0.178241305,0.179138536,
                0.448949859, 0.185187476, 0.64875601)
indswitch=c(1,3,7,6,5,4,2)
gname.fai[indswitch]
qPCR.data <- qPCR.data.fai[indswitch]

cor.test(rnafc,qPCR.data)
#gene.name=c("gnat1","gnat2","opn1lw2 (red opsin)","opn1mw1 (green opsin)","opn1sw2 (blue opsin)","opn1sw1 (uv opsin)","rhodopsin")
gene.name=c("gnat1","gnat2","red","green","blue","uv","rhodopsin")
min1=min(c(rnafc,qPCR.data));max1=max(c(rnafc,qPCR.data))+0.05
col1=c("black","black","red","green","blue","purple","black")

pdf("validatation.pdf",width=5,height=5)
plot(rnafc,qPCR.data, xlim=c(min1,max1),ylim=c(min1,max1),col=col1,
     xlab="Fold Change by RNA seq",ylab="Fold Change by RT-qPCR")
text(rnafc,qPCR.data+c(0.03,0.04,rep(0.03,5)), labels=as.character(gene.name),col=col1, srt=45,font=2) # text
abline(0,1,lty=3)
dev.off()

library(vioplot)
dat.fpkm1 = read.table("cufnorm_out/genes.fpkm_table_from_Cuffnorm",sep="\t",header=TRUE)
### cufnorm_out is the name of the cuffnorm output.
dat.fpkm1[1,]
summary(dat.fpkm[,-1])
dat.fpkm = dat.fpkm1[,c(2,1,3:6)+1]
dat.fpkm[1,]
dim(dat.fpkm)

labvec = paste0(rep(c("PR","WR"),each=3),rep(1:3,2)) ## Make sure that the order of output matches.

par(mfrow=c(2,3))
for (i in 1:6){vioplot(log(dat.fpkm[dat.fpkm[,i]!=0,i]),ylim=c(0,15),names=c(labvec[i]),main=perct[i])}
colnames(dat.fpkm)=labvec

jpeg("pair1PR.jpeg",width=400,height=400,quality=20)
pairs(log(dat.fpkm[,1:3],2))
dev.off()

jpeg("pair2WR.jpeg",width=400,height=400,quality=20)
pairs(log(dat.fpkm[,4:6],2))
dev.off()


## CodeAvailability1.sh
#!/bin/bash


############## Trimmomatic

java -jar /path/to/Trimmomatic/trimmomatic-0.36.jar PE -phred33 $ingzfile1 $ingzfile2 $outfile1 $unmapoutfile1 $outfile2 $unmapoutfile2 ILLUMINACLIP:/path/to/Trimmomatic/adapters/TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 > $logname2 2>&1

############## FastQC
/path/to//FastQC/fastqc $infqfile1 -f fastq --nogroup -o $outdirect

############## STAR: Generate Genome index
/path/to/STAR/bin/Linux_x86_64/STAR --runThreadN 6 --runMode genomeGenerate --genomeDir /file/to/store/index --genomeFastaFiles /path/to/genome/fastaq/Danio_rerio.GRCz10.dna.toplevel.fa --sjdbGTFfile /path/to/genome/gtf/Danio_rerio.GRCz10.89.gtf

############## STAR: Alignment
/path/to/STAR/bin/Linux_x86_64/STAR --runThreadN 16 --genomeDir $gnomedir --readFilesIn $infqgzR1 $infqgzR2 --readFilesCommand gunzip -c --outSAMstrandField intronMotif --limitBAMsortRAM 41000000000 --outFileNamePrefix $outname --outSAMtype BAM SortedByCoordinate --outBAMsortingThreadN 8 --outFilterMultimapNmax 50 --winAnchorMultimapNmax 50 --alignIntronMax 10000 --alignEndsType EndToEnd --outFilterScoreMinOverLread 0.3 --outFilterMatchNminOverLread 0.3

############## Counting: with intron: not fixed yet
less Danio_rerio.GRCz10.89gene.gff3 | grep -v KN1 | grep -v MT | grep protein_coding > Danio_rerio.GRCz10.89gene2.gff3

############## featureCount: Read counting
/path/to/subread-1.5.2-Linux-x86_64/bin/featureCounts -p -B -t gene -g ID -C -T 6 -a /path/to/genome/Danio_rerio.GRCz10.89gene2.gff3 -o $outtxtname $arrayofbamfiles

############## Resort to R for DESeq2 analysis

############## Run Cufflinks/Cuffdiff
/path/to/cufflinks/cufflinks -G /path/to/gtf/Danio_rerio.GRCz10.89.gtf $inbam -o $outdir

/path/to/cufflinks/cuffdiff -p 6 -o $outdir --emit-count-tables -b /path/to/genome/Danio_rerio.GRCz10.dna.toplevel.fa  /path/to/gtf/Danio_rerio.GRCz10.89.gtf \
$mutantbam1,$mutantbam2,$mutantbam3 \
$wildbam1,$wildbam2,$wildbam3  > cufdiff_out.log 2>&1

/path/to/cufflinks/cuffnorm -p 6 -o $outdir --emit-count-tables -b /path/to/genome/Danio_rerio.GRCz10.dna.toplevel.fa  /path/to/gtf/Danio_rerio.GRCz10.89.gtf \
$mutantbam1,$mutantbam2,$mutantbam3 \
$wildbam1,$wildbam2,$wildbam3  > cufdiff_out.log 2>&1
	rm(list=ls())
	setwd("/path/to/directory")
	suppressMessages(library(DESeq2))
	packageVersion("DESeq2")

	### DESeq 2 ###
	dat1 = read.table("outtxtnamefromfeatureCounts.txt",header=TRUE)
	outname = "fc-pval-wgtf.csv"
	dim(dat1)
	dat = floor(dat1[,c(7:12)])
	cond.a = factor(rep(c("PR","WR"), each=3),levels=c("WR","PR")) ### Need to check the order of files
	coldata = data.frame(cond.a=cond.a)
	dds <- DESeqDataSetFromMatrix(dat, DataFrame(cond.a), ~ cond.a)
	dds <- estimateSizeFactors(dds)
	dds <- estimateDispersions(dds)
	deseq.obj = DESeq(dds)
	res = results(deseq.obj)
	logfc = (res$log2FoldChange)
	pval = res$padj
	datout=cbind(logfc,pval)
	sampleDists <- dist(t(assay(rld)))

	#### MA plot from DESeq2 here

	pdf("MA.pdf",height=4,width=4)
	plotMA(res, ylim=c(-2,2),main="MA plot")
	dev.off()

	pdf("heatmap.pdf",height=4,width=4)
	library("RColorBrewer")
	library("pheatmap")
	rld <- rlog(dds, blind=FALSE)
	sampleDists <- dist(t(assay(rld)))
	sampleDistMatrix <- as.matrix(sampleDists)
	rownames(sampleDistMatrix) <- paste(rld$cond.a,rep(1:3,2))
	colnames(sampleDistMatrix) <- paste(rld$cond.a,rep(1:3,2))
	colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
	pheatmap(sampleDistMatrix,
	clustering_distance_rows=sampleDists,
	clustering_distance_cols=sampleDists,
	col=colors)
	dev.off()

	pdf("PCA.pdf",height=4,width=4)
	plotPCA(rld, intgroup=c("cond.a"))
	dev.off()

	########### RNA Seq & qRT-PCR correlation
	fc = 2^logfc
	gene.id1=c("ENSDARG00000044199","ENSDARG00000042529","ENSDARG00000044861","ENSDARG00000097008","ENSDARG00000017274","ENSDARG00000045677","ENSDARG00000002193")
	gene.id = paste0("gene:",gene.id1)
	gene.id.ind = vector(length=length(gene.id))
	for (i in 1:length(gene.id)){
	gene.id.ind[i]=which(as.character(dat1[,1])==gene.id[i])
	}

	rnafc = fc[gene.id.ind]
	gname.fai= c("gnat1", "rho", "gnat2", "uv", "blue", "green", "red")
	qPCR.data.fai <- c(0.694142669, 0.659446831, 0.178241305,0.179138536,
	0.448949859, 0.185187476, 0.64875601)
	indswitch=c(1,3,7,6,5,4,2)
	gname.fai[indswitch]
	qPCR.data <- qPCR.data.fai[indswitch]

	cor.test(rnafc,qPCR.data)
	#gene.name=c("gnat1","gnat2","opn1lw2 (red opsin)","opn1mw1 (green opsin)","opn1sw2 (blue opsin)","opn1sw1 (uv opsin)","rhodopsin")
	gene.name=c("gnat1","gnat2","red","green","blue","uv","rhodopsin")
	min1=min(c(rnafc,qPCR.data));max1=max(c(rnafc,qPCR.data))+0.05
	col1=c("black","black","red","green","blue","purple","black")

	pdf("validatation.pdf",width=5,height=5)
	plot(rnafc,qPCR.data, xlim=c(min1,max1),ylim=c(min1,max1),col=col1,
	xlab="Fold Change by RNA seq",ylab="Fold Change by RT-qPCR")
	text(rnafc,qPCR.data+c(0.03,0.04,rep(0.03,5)), labels=as.character(gene.name),col=col1, srt=45,font=2) # text
	abline(0,1,lty=3)
	dev.off()

	library(vioplot)
	dat.fpkm1 = read.table("cufnorm_out/genes.fpkm_table_from_Cuffnorm",sep="\t",header=TRUE)
	### cufnorm_out is the name of the cuffnorm output.
	dat.fpkm1[1,]
	summary(dat.fpkm[,-1])
	dat.fpkm = dat.fpkm1[,c(2,1,3:6)+1]
	dat.fpkm[1,]
	dim(dat.fpkm)

	labvec = paste0(rep(c("PR","WR"),each=3),rep(1:3,2)) ## Make sure that the order of output matches.

	par(mfrow=c(2,3))
	for (i in 1:6){vioplot(log(dat.fpkm[dat.fpkm[,i]!=0,i]),ylim=c(0,15),names=c(labvec[i]),main=perct[i])}
	colnames(dat.fpkm)=labvec

	jpeg("pair1PR.jpeg",width=400,height=400,quality=20)
	pairs(log(dat.fpkm[,1:3],2))
	dev.off()

	jpeg("pair2WR.jpeg",width=400,height=400,quality=20)
	pairs(log(dat.fpkm[,4:6],2))
	dev.off()
	#!/bin/bash


	############## Trimmomatic

	java -jar /path/to/Trimmomatic/trimmomatic-0.36.jar PE -phred33 $ingzfile1 $ingzfile2 $outfile1 $unmapoutfile1 $outfile2 $unmapoutfile2 ILLUMINACLIP:/path/to/Trimmomatic/adapters/TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 > $logname2 2>&1

	############## FastQC
	/path/to//FastQC/fastqc $infqfile1 -f fastq --nogroup -o $outdirect

	############## STAR: Generate Genome index
	/path/to/STAR/bin/Linux_x86_64/STAR --runThreadN 6 --runMode genomeGenerate --genomeDir /file/to/store/index --genomeFastaFiles /path/to/genome/fastaq/Danio_rerio.GRCz10.dna.toplevel.fa --sjdbGTFfile /path/to/genome/gtf/Danio_rerio.GRCz10.89.gtf

	############## STAR: Alignment
	/path/to/STAR/bin/Linux_x86_64/STAR --runThreadN 16 --genomeDir $gnomedir --readFilesIn $infqgzR1 $infqgzR2 --readFilesCommand gunzip -c --outSAMstrandField intronMotif --limitBAMsortRAM 41000000000 --outFileNamePrefix $outname --outSAMtype BAM SortedByCoordinate --outBAMsortingThreadN 8 --outFilterMultimapNmax 50 --winAnchorMultimapNmax 50 --alignIntronMax 10000 --alignEndsType EndToEnd --outFilterScoreMinOverLread 0.3 --outFilterMatchNminOverLread 0.3

	############## Counting: with intron: not fixed yet
	less Danio_rerio.GRCz10.89gene.gff3 \| grep -v KN1 \| grep -v MT \| grep protein_coding > Danio_rerio.GRCz10.89gene2.gff3

	############## featureCount: Read counting
	/path/to/subread-1.5.2-Linux-x86_64/bin/featureCounts -p -B -t gene -g ID -C -T 6 -a /path/to/genome/Danio_rerio.GRCz10.89gene2.gff3 -o $outtxtname $arrayofbamfiles

	############## Resort to R for DESeq2 analysis

	############## Run Cufflinks/Cuffdiff
	/path/to/cufflinks/cufflinks -G /path/to/gtf/Danio_rerio.GRCz10.89.gtf $inbam -o $outdir

	/path/to/cufflinks/cuffdiff -p 6 -o $outdir --emit-count-tables -b /path/to/genome/Danio_rerio.GRCz10.dna.toplevel.fa /path/to/gtf/Danio_rerio.GRCz10.89.gtf \
	$mutantbam1,$mutantbam2,$mutantbam3 \
	$wildbam1,$wildbam2,$wildbam3 > cufdiff_out.log 2>&1

	/path/to/cufflinks/cuffnorm -p 6 -o $outdir --emit-count-tables -b /path/to/genome/Danio_rerio.GRCz10.dna.toplevel.fa /path/to/gtf/Danio_rerio.GRCz10.89.gtf \
	$mutantbam1,$mutantbam2,$mutantbam3 \
	$wildbam1,$wildbam2,$wildbam3 > cufdiff_out.log 2>&1