Created
January 14, 2019 22:20
-
-
Save slavailn/6b7000b6feacf35a8c4c07a167b0cd5b to your computer and use it in GitHub Desktop.
Process ChIP-seq peaks, based on multiple Bioconductor packages, but centered on ChipSeeker and ClusterProfiler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(GenomicRanges) | |
library(ChIPpeakAnno) | |
library(genomation) | |
library(biomaRt) | |
library(ChIPseeker) | |
library(org.Mm.eg.db) | |
library(ReactomePA) | |
library(clusterProfiler) | |
setwd("<project_dir>") | |
list.files() | |
# Get MACS2 peaks | |
setwd("macs2_peaks/") | |
# Read in MACS2 peaks for each of the conditions | |
# Select and save peaks with FDR < 0.05 | |
ct <- read.table("Ct/Ct_peaks.blacklist.filtered.bed", sep = "\t", header = F, stringsAsFactors = F) | |
names(ct) <- c("chr", "start", "end", "peak_id", "score") | |
ct <- toGRanges(ct) | |
il4 <- read.table("IL4/IL4_13_peaks.blacklist.filtered.bed", sep = "\t", header = F, stringsAsFactors = F) | |
names(il4) <- c("chr", "start", "end", "peak_id", "score") | |
il4 <- toGRanges(il4) | |
lps <- read.table("LPS_IL4/LPS_IL4_13_peaks.blacklist.filtered.bed", sep = "\t", header = F, stringsAsFactors = F) | |
names(lps) <- c("chr", "start", "end", "peak_id", "score") | |
lps <- toGRanges(lps) | |
### | |
# Read in SISSRs peaks for each of the conditions | |
setwd("../sissrs_peaks/") | |
list.files() | |
ct_sissr <- read.table("Ct.bed", sep = "\t", header = F, stringsAsFactors = F) | |
names(ct_sissr) <- c("chr", "start", "end", "pvalue") | |
ct_sissr <- toGRanges(ct_sissr) | |
il4_sissr <- read.table("IL4_13.bed", sep = "\t", header = F, stringsAsFactors = F) | |
names(il4_sissr) <- c("chr", "start", "end", "pvalue") | |
il4_sissr <- toGRanges(il4_sissr) | |
lps_sissr <- read.table("LPS_IL4_13.bed", sep = "\t", header = F, stringsAsFactors = F) | |
names(lps_sissr) <- c("chr", "start", "end", "pvalue") | |
lps_sissr <- toGRanges(lps_sissr) | |
# Check overlaps between MACS2 and SISSRs | |
tiff("Ct_MACS_SISSRs_overlap.tiff", width = 500, height = 500) | |
makeVennDiagram(list(ct, ct_sissr), NameOfPeaks = c("MACS14", "SISSRs"), main="Control") | |
dev.off() | |
tiff("IL4_MACS_SISSRs_overlap.tiff", width = 500, height = 500) | |
makeVennDiagram(list(il4, il4_sissr), NameOfPeaks = c("MACS14", "SISSRs"), main="IL4") | |
dev.off() | |
tiff("LPS_MACS_SISSRs_overlap.tiff", width = 500, height = 500) | |
makeVennDiagram(list(lps, lps_sissr), NameOfPeaks = c("MACS14", "SISSRs"), main="LPS_IL4") | |
dev.off() | |
# What are the overlaps between experimental groups when MACS14 are considered | |
tiff("MACS14_peaks_overlap.tiff", width = 500, height = 500) | |
makeVennDiagram(list(ct, il4, lps), NameOfPeaks = c("Control", "IL4", "LPS_IL4"), main="MACS14 peaks") | |
dev.off() | |
tiff("MACS14_CT_IL4_peaks_overlap.tiff", width = 500, height = 500) | |
makeVennDiagram(list(ct, il4), NameOfPeaks = c("Control", "IL4"), main="MACS14 peaks") | |
dev.off() | |
tiff("MACS14_CT_LPS_peaks_overlap.tiff", width = 500, height = 500) | |
makeVennDiagram(list(ct, lps), NameOfPeaks = c("Control", "LPS_IL4"), main="MACS14 peaks") | |
dev.off() | |
tiff("MACS14_IL4_LPS_peaks_overlap.tiff", width = 500, height = 500) | |
makeVennDiagram(list(il4, lps), NameOfPeaks = c("IL4", "LPS_IL4"), main="MACS14 peaks") | |
dev.off() | |
###################################################################################################### | |
## Annotate peaks with ChiPpeakAnno | |
ensembl <- useMart("ensembl") | |
listDatasets(ensembl)[grep("Mouse", listDatasets(ensembl)$description),] | |
ensembl <- useDataset("mmusculus_gene_ensembl", mart=ensembl) | |
ct_annot <- annotatePeakInBatch(myPeakList = ct, | |
mart=ensembl, | |
featureType = "TSS", | |
output = "nearestLocation", | |
PeakLocForDistance = "middle", | |
select = "first", | |
ignore.strand = T) | |
# Add gene ids | |
ct_annot <- addGeneIDs(ct_annot, mart = ensembl, feature_id_type = "ensembl_gene_id", | |
IDs2Add = c("entrezgene", "mgi_symbol", "description")) | |
head(ct_annot) | |
write.table(ct_annot, file="Ct_annotated_peaks.txt", sep = "\t", col.names = T, row.names = F, quote = F) | |
il4_annot <- annotatePeakInBatch(myPeakList = il4, | |
mart=ensembl, | |
featureType = "TSS", | |
output = "nearestLocation", | |
PeakLocForDistance = "middle", | |
select = "first", | |
ignore.strand = T) | |
# Add gene ids | |
il4_annot <- addGeneIDs(il4_annot, mart = ensembl, feature_id_type = "ensembl_gene_id", | |
IDs2Add = c("entrezgene", "mgi_symbol", "description")) | |
head(il4_annot) | |
write.table(il4_annot, file="il4_annotated_peaks.txt", sep = "\t", col.names = T, row.names = F, quote = F) | |
lps_annot <- annotatePeakInBatch(myPeakList = lps, | |
mart=ensembl, | |
featureType = "TSS", | |
output = "nearestLocation", | |
PeakLocForDistance = "middle", | |
select = "first", | |
ignore.strand = T) | |
# Add gene ids | |
lps_annot <- addGeneIDs(lps_annot, mart = ensembl, feature_id_type = "ensembl_gene_id", | |
IDs2Add = c("entrezgene", "mgi_symbol", "description")) | |
head(lps_annot) | |
write.table(lps_annot, file="lps_annotated_peaks.txt", sep = "\t", col.names = T, row.names = F, quote = F) | |
################################################################################################################## | |
## Classify peaks by relation to genomic features (promoter, intronic, intergenic etc.) | |
## Aanalyze distribution of dictances to TSS from different datasets | |
## This will be done using ChipSeeker | |
## Create annotation data base from biomaRt | |
txdb <- makeTxDbFromBiomart(biomart="ENSEMBL_MART_ENSEMBL", | |
dataset="mmusculus_gene_ensembl", | |
transcript_ids=NULL, | |
circ_seqs=DEFAULT_CIRC_SEQS, | |
filter=NULL, | |
id_prefix="ensembl_", | |
host="www.ensembl.org", | |
port=80, | |
taxonomyId=NA, | |
miRBaseBuild=NA) | |
txdb | |
## Read peak files filetered from blacklisted regions | |
setwd("~/Projects/ChIP_CREB_BO_17AUG1018/macs2_peaks/") | |
list.files() | |
ct_path <- "Ct/Ct_peaks.blacklist.filtered.bed" | |
il4_path <- "IL4/IL4_13_peaks.blacklist.filtered.bed" | |
lps_path <- "LPS_IL4/LPS_IL4_13_peaks.blacklist.filtered.bed" | |
files <- list(ct_path, il4_path, lps_path) | |
files | |
# Create a list of peaks | |
peaks <- lapply(files, function(x) readPeakFile(x)) | |
peaks | |
names(peaks) <- c("Ct", "IL4", "LPS_IL4") | |
names(peaks) | |
# Plot peak coverage over the chromosomes | |
pdf("Ct_peaks_over_chromosomed.pdf") | |
covplot(peaks[["Ct"]], weightCol="V5", title = "ChIP Peaks over Chromosomes in Control") | |
dev.off() | |
pdf("IL4_peaks_over_chromosomed.pdf") | |
covplot(peaks[["IL4"]], weightCol="V5", title = "ChIP Peaks over Chromosomes in IL4") | |
dev.off() | |
pdf("LPS_IL4_peaks_over_chromosomed.pdf") | |
covplot(peaks[["LPS_IL4"]], weightCol="V5", title = "ChIP Peaks over Chromosomes in LPS_IL4") | |
dev.off() | |
########################################################################## | |
# Profile ChIP peaks binding to TSS regions | |
promoter <- getPromoters(TxDb=txdb, upstream=3000, downstream=3000) | |
tagMatrix_CT <- getTagMatrix(peaks[["Ct"]], windows=promoter) | |
tagMatrix_IL4 <- getTagMatrix(peaks[["IL4"]], windows=promoter) | |
tagMatrix_LPS_IL4 <- getTagMatrix(peaks[["LPS_IL4"]], windows=promoter) | |
tiff("peak_profile_TSS.tiff", width = 600, height = 500) | |
par(mfrow=c(1,3)) | |
tagHeatmap(tagMatrix_CT, xlim=c(-3000, 3000), color="red", title = "Control") | |
tagHeatmap(tagMatrix_IL4, xlim=c(-3000, 3000), color="red", title = "IL4") | |
tagHeatmap(tagMatrix_LPS_IL4, xlim=c(-3000, 3000), color="red", title = "LPS_IL4") | |
dev.off() | |
## Histogram of average distances to TSS, all samples overlayed | |
tiff("mean_distance_to_TSS_profile.tiff", width = 500, height = 500) | |
plotAvgProf(list(Control=tagMatrix_CT, IL4=tagMatrix_IL4, LPS_IL4=tagMatrix_LPS_IL4), xlim=c(-3000, 3000), | |
xlab="Genomic Region (5'->3')", ylab = "Read Count Frequency") | |
dev.off() | |
########################################################################### | |
# Peak annotation (by default promoter region is -3kb to +3kb.) | |
# Default region priority for the annotation (this can be changed genomicAnnotationPriority argument) | |
# Promoter | |
# 5’ UTR | |
# 3’ UTR | |
# Exon | |
# Intron | |
# Downstream | |
# Intergenic | |
# Compare multiple datasets with one command | |
names(files) <- c("Control", "IL4", "LPS_IL4") | |
peakAnnoList <- lapply(files, function(x) annotatePeak(x, TxDb=txdb, tssRegion=c(-3000, 3000), | |
annoDb="org.Mm.eg.db")) | |
peakAnnoList | |
# Plot annotation bars simultaneusly for multiple datasets | |
tiff("Genomic_Features_overlaps.tiff", width = 500, height = 500) | |
plotAnnoBar(peakAnnoList) | |
dev.off() | |
# Compare distances to TSS between peak lists | |
# Compare distribution of distances to TSS between datasets | |
tiff("distance_to_TSS_comparison.tiff", width = 500, height = 500) | |
plotDistToTSS(peakAnnoList) | |
dev.off() | |
# Save ChIPseeker annotations | |
write.table(as.data.frame(peakAnnoList[[1]]), file="Control_peaks_ChIPseeker_annotated.txt", | |
sep = "\t", quote = F, col.names = T, row.names = F) | |
write.table(as.data.frame(peakAnnoList[[2]]), file="IL4_peaks_ChIPseeker_annotated.txt", | |
sep = "\t", quote = F, col.names = T, row.names = F) | |
write.table(as.data.frame(peakAnnoList[[3]]), file="IL4_LPS_peaks_ChIPseeker_annotated.txt", | |
sep = "\t", quote = F, col.names = T, row.names = F) | |
############################################################################################### | |
# Calculate GO and KEGG enrichment using clusterProfiler | |
ct_annot <- as.data.frame(peakAnnoList[[1]]) | |
il4_annot <- as.data.frame(peakAnnoList[[2]]) | |
lps_annot <- as.data.frame(peakAnnoList[[3]]) | |
# Keep only peaks located within 3000 bp up- or down-stream to TSS | |
ct_annot <- ct_annot[abs(ct_annot$distanceToTSS) < 3000,] | |
il4_annot <- il4_annot[abs(il4_annot$distanceToTSS) < 3000,] | |
lps_annot <- lps_annot[abs(lps_annot$distanceToTSS) < 3000,] | |
# Calculate GO enrichment | |
ct_GO <-enrichGO(gene = ct_annot$SYMBOL, | |
OrgDb = org.Mm.eg.db, | |
keyType = "SYMBOL", | |
ont = "BP", | |
pAdjustMethod = "BH", | |
pvalueCutoff = 0.05, | |
qvalueCutoff = 0.1) | |
write.table(ct_GO@result, file = "Control_GO.txt", sep = "\t", col.names = T, row.names = F, quote = F) | |
tiff("Control_GO_dotplot.tiff", width = 600, height = 500) | |
dotplot(ct_GO, showCategory = 20) | |
dev.off() | |
il4_GO <-enrichGO(gene = il4_annot$SYMBOL, | |
OrgDb = org.Mm.eg.db, | |
keyType = "SYMBOL", | |
ont = "BP", | |
pAdjustMethod = "BH", | |
pvalueCutoff = 0.05, | |
qvalueCutoff = 0.1) | |
write.table(il4_GO@result, file = "IL4_GO.txt", sep = "\t", col.names = T, row.names = F, quote = F) | |
tiff("IL4_GO_dotplot.tiff", width = 600, height = 500) | |
dotplot(il4_GO, showCategory = 20) | |
dev.off() | |
lps_GO <-enrichGO(gene = lps_annot$SYMBOL, | |
OrgDb = org.Mm.eg.db, | |
keyType = "SYMBOL", | |
ont = "BP", | |
pAdjustMethod = "BH", | |
pvalueCutoff = 0.05, | |
qvalueCutoff = 0.1) | |
write.table(lps_GO@result, file = "LPS_IL4_GO.txt", sep = "\t", col.names = T, row.names = F, quote = F) | |
tiff("LPS_IL4_GO_dotplot.tiff", width = 600, height = 500) | |
dotplot(lps_GO, showCategory = 20) | |
dev.off() | |
######################################################################################################## | |
## Calculate KEGG enrichment | |
setwd("../peak_analysis/") | |
ct_KEGG <- enrichKEGG(gene = ct_annot$ENTREZID, | |
organism = 'mmu', | |
pvalueCutoff = 0.05) | |
write.table(ct_KEGG@result, file = "CT_KEGG.txt", sep = "\t", col.names = T, row.names = F, quote = F) | |
tiff("CT_KEGG_dotplot.tiff", width = 600, height = 500) | |
dotplot(ct_KEGG, showCategory = 20) | |
dev.off() | |
il4_KEGG <- enrichKEGG(gene = il4_annot$ENTREZID, | |
organism = 'mmu', | |
pvalueCutoff = 0.05) | |
write.table(il4_KEGG@result, file = "IL4_KEGG.txt", sep = "\t", col.names = T, row.names = F, quote = F) | |
tiff("IL4_KEGG_dotplot.tiff", width = 600, height = 500) | |
dotplot(il4_KEGG, showCategory = 20) | |
dev.off() | |
lps_KEGG <- enrichKEGG(gene = lps_annot$ENTREZID, | |
organism = 'mmu', | |
pvalueCutoff = 0.05) | |
write.table(lps_KEGG@result, file = "LPS_IL4_KEGG.txt", sep = "\t", col.names = T, row.names = F, quote = F) | |
tiff("LPS_IL4_KEGG_dotplot.tiff", width = 600, height = 500) | |
dotplot(lps_KEGG, showCategory = 20) | |
dev.off() | |
############################################################################################################# | |
# Calculate enrichment for reactomePA | |
reactome_ct <- enrichPathway(ct_annot$ENTREZID, organism="mouse", pvalueCutoff = 0.1, pAdjustMethod = "BH", minGSSize = 10, | |
maxGSSize = 500) | |
write.table(reactome_ct@result, file="CT_Reactome_PA.txt", sep = "\t", col.names = T, row.names = F, | |
quote = F) | |
tiff("CT_Reactome.tiff", width = 600, height = 500) | |
dotplot(reactome_ct, showCategory = 20) | |
dev.off() | |
reactome_il4 <- enrichPathway(il4_annot$ENTREZID, organism="mouse", pvalueCutoff = 0.1, pAdjustMethod = "BH", minGSSize = 10, | |
maxGSSize = 500) | |
write.table(reactome_il4@result, file="IL4_Reactome_PA.txt", sep = "\t", col.names = T, row.names = F, | |
quote = F) | |
tiff("IL4_Reactome.tiff", width = 600, height = 500) | |
dotplot(reactome_il4, showCategory = 20) | |
dev.off() | |
reactome_lps <- enrichPathway(lps_annot$ENTREZID, organism="mouse", pvalueCutoff = 0.1, pAdjustMethod = "BH", minGSSize = 10, | |
maxGSSize = 500) | |
write.table(reactome_lps@result, file="LPS_IL4_Reactome_PA.txt", sep = "\t", col.names = T, row.names = F, | |
quote = F) | |
tiff("LPS_IL4_Reactome.tiff", width = 600, height = 500) | |
dotplot(reactome_lps, showCategory = 20) | |
dev.off() | |
################################################################################################################# | |
# Compare biological themes (GO and pathway enrichment among peak lists) | |
# Create list of entrez IDs for each of the experiments (promoter associated --> abs(distanceToTSS < 3000)) | |
id_list <- list(Control=ct_annot$ENTREZID, IL4=il4_annot$ENTREZID, LPS_IL4=lps_annot$ENTREZID) | |
compKEGG <- compareCluster(geneCluster = id_list, | |
fun = "enrichKEGG", | |
pvalueCutoff = 0.05, | |
pAdjustMethod = "BH", | |
organism = "mmu") | |
tiff("compare_enrichment_KEGG.tiff", width = 600, height = 500) | |
dotplot(compKEGG, showCategory = 20, title = "KEGG Pathway Enrichment Comparison") | |
dev.off() | |
compGO <- compareCluster(geneClusters = id_list, | |
fun = "enrichGO", | |
qvalueCutoff = 0.05, | |
pAdjustMethod = "BH", | |
OrgDb = org.Mm.eg.db, | |
ont = "BP") | |
tiff("compare_enrichment_GO.tiff", width = 1200, height = 600) | |
dotplot(compGO, showCategory = 20, title = "GO Enrichment Comparison") | |
dev.off() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment