Skip to content

Instantly share code, notes, and snippets.

@slavailn
Created January 14, 2019 22:20
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save slavailn/6b7000b6feacf35a8c4c07a167b0cd5b to your computer and use it in GitHub Desktop.
Save slavailn/6b7000b6feacf35a8c4c07a167b0cd5b to your computer and use it in GitHub Desktop.
Process ChIP-seq peaks, based on multiple Bioconductor packages, but centered on ChipSeeker and ClusterProfiler
library(GenomicRanges)
library(ChIPpeakAnno)
library(genomation)
library(biomaRt)
library(ChIPseeker)
library(org.Mm.eg.db)
library(ReactomePA)
library(clusterProfiler)
setwd("<project_dir>")
list.files()
# Get MACS2 peaks
setwd("macs2_peaks/")
# Read in MACS2 peaks for each of the conditions
# Select and save peaks with FDR < 0.05
ct <- read.table("Ct/Ct_peaks.blacklist.filtered.bed", sep = "\t", header = F, stringsAsFactors = F)
names(ct) <- c("chr", "start", "end", "peak_id", "score")
ct <- toGRanges(ct)
il4 <- read.table("IL4/IL4_13_peaks.blacklist.filtered.bed", sep = "\t", header = F, stringsAsFactors = F)
names(il4) <- c("chr", "start", "end", "peak_id", "score")
il4 <- toGRanges(il4)
lps <- read.table("LPS_IL4/LPS_IL4_13_peaks.blacklist.filtered.bed", sep = "\t", header = F, stringsAsFactors = F)
names(lps) <- c("chr", "start", "end", "peak_id", "score")
lps <- toGRanges(lps)
###
# Read in SISSRs peaks for each of the conditions
setwd("../sissrs_peaks/")
list.files()
ct_sissr <- read.table("Ct.bed", sep = "\t", header = F, stringsAsFactors = F)
names(ct_sissr) <- c("chr", "start", "end", "pvalue")
ct_sissr <- toGRanges(ct_sissr)
il4_sissr <- read.table("IL4_13.bed", sep = "\t", header = F, stringsAsFactors = F)
names(il4_sissr) <- c("chr", "start", "end", "pvalue")
il4_sissr <- toGRanges(il4_sissr)
lps_sissr <- read.table("LPS_IL4_13.bed", sep = "\t", header = F, stringsAsFactors = F)
names(lps_sissr) <- c("chr", "start", "end", "pvalue")
lps_sissr <- toGRanges(lps_sissr)
# Check overlaps between MACS2 and SISSRs
tiff("Ct_MACS_SISSRs_overlap.tiff", width = 500, height = 500)
makeVennDiagram(list(ct, ct_sissr), NameOfPeaks = c("MACS14", "SISSRs"), main="Control")
dev.off()
tiff("IL4_MACS_SISSRs_overlap.tiff", width = 500, height = 500)
makeVennDiagram(list(il4, il4_sissr), NameOfPeaks = c("MACS14", "SISSRs"), main="IL4")
dev.off()
tiff("LPS_MACS_SISSRs_overlap.tiff", width = 500, height = 500)
makeVennDiagram(list(lps, lps_sissr), NameOfPeaks = c("MACS14", "SISSRs"), main="LPS_IL4")
dev.off()
# What are the overlaps between experimental groups when MACS14 are considered
tiff("MACS14_peaks_overlap.tiff", width = 500, height = 500)
makeVennDiagram(list(ct, il4, lps), NameOfPeaks = c("Control", "IL4", "LPS_IL4"), main="MACS14 peaks")
dev.off()
tiff("MACS14_CT_IL4_peaks_overlap.tiff", width = 500, height = 500)
makeVennDiagram(list(ct, il4), NameOfPeaks = c("Control", "IL4"), main="MACS14 peaks")
dev.off()
tiff("MACS14_CT_LPS_peaks_overlap.tiff", width = 500, height = 500)
makeVennDiagram(list(ct, lps), NameOfPeaks = c("Control", "LPS_IL4"), main="MACS14 peaks")
dev.off()
tiff("MACS14_IL4_LPS_peaks_overlap.tiff", width = 500, height = 500)
makeVennDiagram(list(il4, lps), NameOfPeaks = c("IL4", "LPS_IL4"), main="MACS14 peaks")
dev.off()
######################################################################################################
## Annotate peaks with ChiPpeakAnno
ensembl <- useMart("ensembl")
listDatasets(ensembl)[grep("Mouse", listDatasets(ensembl)$description),]
ensembl <- useDataset("mmusculus_gene_ensembl", mart=ensembl)
ct_annot <- annotatePeakInBatch(myPeakList = ct,
mart=ensembl,
featureType = "TSS",
output = "nearestLocation",
PeakLocForDistance = "middle",
select = "first",
ignore.strand = T)
# Add gene ids
ct_annot <- addGeneIDs(ct_annot, mart = ensembl, feature_id_type = "ensembl_gene_id",
IDs2Add = c("entrezgene", "mgi_symbol", "description"))
head(ct_annot)
write.table(ct_annot, file="Ct_annotated_peaks.txt", sep = "\t", col.names = T, row.names = F, quote = F)
il4_annot <- annotatePeakInBatch(myPeakList = il4,
mart=ensembl,
featureType = "TSS",
output = "nearestLocation",
PeakLocForDistance = "middle",
select = "first",
ignore.strand = T)
# Add gene ids
il4_annot <- addGeneIDs(il4_annot, mart = ensembl, feature_id_type = "ensembl_gene_id",
IDs2Add = c("entrezgene", "mgi_symbol", "description"))
head(il4_annot)
write.table(il4_annot, file="il4_annotated_peaks.txt", sep = "\t", col.names = T, row.names = F, quote = F)
lps_annot <- annotatePeakInBatch(myPeakList = lps,
mart=ensembl,
featureType = "TSS",
output = "nearestLocation",
PeakLocForDistance = "middle",
select = "first",
ignore.strand = T)
# Add gene ids
lps_annot <- addGeneIDs(lps_annot, mart = ensembl, feature_id_type = "ensembl_gene_id",
IDs2Add = c("entrezgene", "mgi_symbol", "description"))
head(lps_annot)
write.table(lps_annot, file="lps_annotated_peaks.txt", sep = "\t", col.names = T, row.names = F, quote = F)
##################################################################################################################
## Classify peaks by relation to genomic features (promoter, intronic, intergenic etc.)
## Aanalyze distribution of dictances to TSS from different datasets
## This will be done using ChipSeeker
## Create annotation data base from biomaRt
txdb <- makeTxDbFromBiomart(biomart="ENSEMBL_MART_ENSEMBL",
dataset="mmusculus_gene_ensembl",
transcript_ids=NULL,
circ_seqs=DEFAULT_CIRC_SEQS,
filter=NULL,
id_prefix="ensembl_",
host="www.ensembl.org",
port=80,
taxonomyId=NA,
miRBaseBuild=NA)
txdb
## Read peak files filetered from blacklisted regions
setwd("~/Projects/ChIP_CREB_BO_17AUG1018/macs2_peaks/")
list.files()
ct_path <- "Ct/Ct_peaks.blacklist.filtered.bed"
il4_path <- "IL4/IL4_13_peaks.blacklist.filtered.bed"
lps_path <- "LPS_IL4/LPS_IL4_13_peaks.blacklist.filtered.bed"
files <- list(ct_path, il4_path, lps_path)
files
# Create a list of peaks
peaks <- lapply(files, function(x) readPeakFile(x))
peaks
names(peaks) <- c("Ct", "IL4", "LPS_IL4")
names(peaks)
# Plot peak coverage over the chromosomes
pdf("Ct_peaks_over_chromosomed.pdf")
covplot(peaks[["Ct"]], weightCol="V5", title = "ChIP Peaks over Chromosomes in Control")
dev.off()
pdf("IL4_peaks_over_chromosomed.pdf")
covplot(peaks[["IL4"]], weightCol="V5", title = "ChIP Peaks over Chromosomes in IL4")
dev.off()
pdf("LPS_IL4_peaks_over_chromosomed.pdf")
covplot(peaks[["LPS_IL4"]], weightCol="V5", title = "ChIP Peaks over Chromosomes in LPS_IL4")
dev.off()
##########################################################################
# Profile ChIP peaks binding to TSS regions
promoter <- getPromoters(TxDb=txdb, upstream=3000, downstream=3000)
tagMatrix_CT <- getTagMatrix(peaks[["Ct"]], windows=promoter)
tagMatrix_IL4 <- getTagMatrix(peaks[["IL4"]], windows=promoter)
tagMatrix_LPS_IL4 <- getTagMatrix(peaks[["LPS_IL4"]], windows=promoter)
tiff("peak_profile_TSS.tiff", width = 600, height = 500)
par(mfrow=c(1,3))
tagHeatmap(tagMatrix_CT, xlim=c(-3000, 3000), color="red", title = "Control")
tagHeatmap(tagMatrix_IL4, xlim=c(-3000, 3000), color="red", title = "IL4")
tagHeatmap(tagMatrix_LPS_IL4, xlim=c(-3000, 3000), color="red", title = "LPS_IL4")
dev.off()
## Histogram of average distances to TSS, all samples overlayed
tiff("mean_distance_to_TSS_profile.tiff", width = 500, height = 500)
plotAvgProf(list(Control=tagMatrix_CT, IL4=tagMatrix_IL4, LPS_IL4=tagMatrix_LPS_IL4), xlim=c(-3000, 3000),
xlab="Genomic Region (5'->3')", ylab = "Read Count Frequency")
dev.off()
###########################################################################
# Peak annotation (by default promoter region is -3kb to +3kb.)
# Default region priority for the annotation (this can be changed genomicAnnotationPriority argument)
# Promoter
# 5’ UTR
# 3’ UTR
# Exon
# Intron
# Downstream
# Intergenic
# Compare multiple datasets with one command
names(files) <- c("Control", "IL4", "LPS_IL4")
peakAnnoList <- lapply(files, function(x) annotatePeak(x, TxDb=txdb, tssRegion=c(-3000, 3000),
annoDb="org.Mm.eg.db"))
peakAnnoList
# Plot annotation bars simultaneusly for multiple datasets
tiff("Genomic_Features_overlaps.tiff", width = 500, height = 500)
plotAnnoBar(peakAnnoList)
dev.off()
# Compare distances to TSS between peak lists
# Compare distribution of distances to TSS between datasets
tiff("distance_to_TSS_comparison.tiff", width = 500, height = 500)
plotDistToTSS(peakAnnoList)
dev.off()
# Save ChIPseeker annotations
write.table(as.data.frame(peakAnnoList[[1]]), file="Control_peaks_ChIPseeker_annotated.txt",
sep = "\t", quote = F, col.names = T, row.names = F)
write.table(as.data.frame(peakAnnoList[[2]]), file="IL4_peaks_ChIPseeker_annotated.txt",
sep = "\t", quote = F, col.names = T, row.names = F)
write.table(as.data.frame(peakAnnoList[[3]]), file="IL4_LPS_peaks_ChIPseeker_annotated.txt",
sep = "\t", quote = F, col.names = T, row.names = F)
###############################################################################################
# Calculate GO and KEGG enrichment using clusterProfiler
ct_annot <- as.data.frame(peakAnnoList[[1]])
il4_annot <- as.data.frame(peakAnnoList[[2]])
lps_annot <- as.data.frame(peakAnnoList[[3]])
# Keep only peaks located within 3000 bp up- or down-stream to TSS
ct_annot <- ct_annot[abs(ct_annot$distanceToTSS) < 3000,]
il4_annot <- il4_annot[abs(il4_annot$distanceToTSS) < 3000,]
lps_annot <- lps_annot[abs(lps_annot$distanceToTSS) < 3000,]
# Calculate GO enrichment
ct_GO <-enrichGO(gene = ct_annot$SYMBOL,
OrgDb = org.Mm.eg.db,
keyType = "SYMBOL",
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.1)
write.table(ct_GO@result, file = "Control_GO.txt", sep = "\t", col.names = T, row.names = F, quote = F)
tiff("Control_GO_dotplot.tiff", width = 600, height = 500)
dotplot(ct_GO, showCategory = 20)
dev.off()
il4_GO <-enrichGO(gene = il4_annot$SYMBOL,
OrgDb = org.Mm.eg.db,
keyType = "SYMBOL",
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.1)
write.table(il4_GO@result, file = "IL4_GO.txt", sep = "\t", col.names = T, row.names = F, quote = F)
tiff("IL4_GO_dotplot.tiff", width = 600, height = 500)
dotplot(il4_GO, showCategory = 20)
dev.off()
lps_GO <-enrichGO(gene = lps_annot$SYMBOL,
OrgDb = org.Mm.eg.db,
keyType = "SYMBOL",
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.1)
write.table(lps_GO@result, file = "LPS_IL4_GO.txt", sep = "\t", col.names = T, row.names = F, quote = F)
tiff("LPS_IL4_GO_dotplot.tiff", width = 600, height = 500)
dotplot(lps_GO, showCategory = 20)
dev.off()
########################################################################################################
## Calculate KEGG enrichment
setwd("../peak_analysis/")
ct_KEGG <- enrichKEGG(gene = ct_annot$ENTREZID,
organism = 'mmu',
pvalueCutoff = 0.05)
write.table(ct_KEGG@result, file = "CT_KEGG.txt", sep = "\t", col.names = T, row.names = F, quote = F)
tiff("CT_KEGG_dotplot.tiff", width = 600, height = 500)
dotplot(ct_KEGG, showCategory = 20)
dev.off()
il4_KEGG <- enrichKEGG(gene = il4_annot$ENTREZID,
organism = 'mmu',
pvalueCutoff = 0.05)
write.table(il4_KEGG@result, file = "IL4_KEGG.txt", sep = "\t", col.names = T, row.names = F, quote = F)
tiff("IL4_KEGG_dotplot.tiff", width = 600, height = 500)
dotplot(il4_KEGG, showCategory = 20)
dev.off()
lps_KEGG <- enrichKEGG(gene = lps_annot$ENTREZID,
organism = 'mmu',
pvalueCutoff = 0.05)
write.table(lps_KEGG@result, file = "LPS_IL4_KEGG.txt", sep = "\t", col.names = T, row.names = F, quote = F)
tiff("LPS_IL4_KEGG_dotplot.tiff", width = 600, height = 500)
dotplot(lps_KEGG, showCategory = 20)
dev.off()
#############################################################################################################
# Calculate enrichment for reactomePA
reactome_ct <- enrichPathway(ct_annot$ENTREZID, organism="mouse", pvalueCutoff = 0.1, pAdjustMethod = "BH", minGSSize = 10,
maxGSSize = 500)
write.table(reactome_ct@result, file="CT_Reactome_PA.txt", sep = "\t", col.names = T, row.names = F,
quote = F)
tiff("CT_Reactome.tiff", width = 600, height = 500)
dotplot(reactome_ct, showCategory = 20)
dev.off()
reactome_il4 <- enrichPathway(il4_annot$ENTREZID, organism="mouse", pvalueCutoff = 0.1, pAdjustMethod = "BH", minGSSize = 10,
maxGSSize = 500)
write.table(reactome_il4@result, file="IL4_Reactome_PA.txt", sep = "\t", col.names = T, row.names = F,
quote = F)
tiff("IL4_Reactome.tiff", width = 600, height = 500)
dotplot(reactome_il4, showCategory = 20)
dev.off()
reactome_lps <- enrichPathway(lps_annot$ENTREZID, organism="mouse", pvalueCutoff = 0.1, pAdjustMethod = "BH", minGSSize = 10,
maxGSSize = 500)
write.table(reactome_lps@result, file="LPS_IL4_Reactome_PA.txt", sep = "\t", col.names = T, row.names = F,
quote = F)
tiff("LPS_IL4_Reactome.tiff", width = 600, height = 500)
dotplot(reactome_lps, showCategory = 20)
dev.off()
#################################################################################################################
# Compare biological themes (GO and pathway enrichment among peak lists)
# Create list of entrez IDs for each of the experiments (promoter associated --> abs(distanceToTSS < 3000))
id_list <- list(Control=ct_annot$ENTREZID, IL4=il4_annot$ENTREZID, LPS_IL4=lps_annot$ENTREZID)
compKEGG <- compareCluster(geneCluster = id_list,
fun = "enrichKEGG",
pvalueCutoff = 0.05,
pAdjustMethod = "BH",
organism = "mmu")
tiff("compare_enrichment_KEGG.tiff", width = 600, height = 500)
dotplot(compKEGG, showCategory = 20, title = "KEGG Pathway Enrichment Comparison")
dev.off()
compGO <- compareCluster(geneClusters = id_list,
fun = "enrichGO",
qvalueCutoff = 0.05,
pAdjustMethod = "BH",
OrgDb = org.Mm.eg.db,
ont = "BP")
tiff("compare_enrichment_GO.tiff", width = 1200, height = 600)
dotplot(compGO, showCategory = 20, title = "GO Enrichment Comparison")
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment