Skip to content

Instantly share code, notes, and snippets.

@Balharbi
Forked from slavailn/jaccard_genes_cluster.R
Created August 26, 2020 14:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Balharbi/b7dcd2d097d2cf9586f3c742545bf4b4 to your computer and use it in GitHub Desktop.
Save Balharbi/b7dcd2d097d2cf9586f3c742545bf4b4 to your computer and use it in GitHub Desktop.
Cluster sets of genes based on Jaccard distance (mostly stolen from Biostars post)
# test Jaccard
# Create a list gene sets
char1 <- c("gene1", "gene2", "gene5", "gene9", "gene10")
char2 <- c("gene2", "gene3", "gene5", "gene7", "gene10")
char3 <- c("gene7", "gene9", "gene10", "gene11", "gene12", "gene1")
lst <- list(char1, char2, char3)
# Function to calculate Jaccard distance
siml <- function(x,y) {
1-length(intersect(lst[[x]],lst[[y]]))/length(union(lst[[x]],lst[[y]]))
}
# get a data frame of all factor combinations
z <- expand.grid(x=1:length(lst), y=1:length(lst))
# calculate Jaccard distances between each pair of gene sets
result <- mapply(siml,z$x,z$y)
# Turn into matrix
dim(result) <- c(length(lst),length(lst))
# Convert to distance
dists <- as.dist(result)
# Cluster a plot a dendrogram
plot(hclust(dists))
############ FUNCTION FOR GOstats on KEGG only ####################################
## The GOstats output has to have a last column with comma separated gene names
clusterJakkard <- function(res_table, plot_name)
{
# res_table="Neuroblastoma_DOWN_SIG_GOstats_KEGG.txt" this has to be a file in my usual format
# plot_name="Neuroblastoma_DOWN_SIG_KEGG_clusters.tiff"
keggtab <- read.table(res_table, sep = "\t", header = T, quote = "\"",
stringsAsFactors = F)
keggtab$KEGGID <- paste("0", keggtab$KEGGID, sep = "")
keggtab <- data.frame(KEGGID=keggtab$KEGGID, Term=keggtab$Term, Genes=keggtab$Genes)
keggtab$KEGGID <- paste(keggtab$KEGGID, keggtab$Term, sep="::")
keggtab <- keggtab[,-2]
# Get the list of vectors
gene.list <- strsplit(as.character(keggtab$Genes), ";")
names(gene.list) <- keggtab$KEGGID
lst <- gene.list
siml <- function(x,y) {
1-length(intersect(lst[[x]],lst[[y]]))/length(union(lst[[x]],lst[[y]]))
}
z <- expand.grid(x=1:length(lst), y=1:length(lst))
result <- mapply(siml,z$x,z$y)
dim(result) <- c(length(lst),length(lst))
colnames(result) <- names(lst)
rownames(result) <- names(lst)
dists <- as.dist(result)
hc <- hclust(dists, method="ward.D2")
tiff(plot_name, width=500, height = 800)
par(cex = 0.8, mar=c(2,2,2,15))
plot(as.dendrogram(hc), horiz=T, type = "triangle")
dev.off()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment