Skip to content

Instantly share code, notes, and snippets.

@slowkow
Last active June 1, 2022 13:28
Show Gist options
  • Save slowkow/395448fe1094c8ea4c7d to your computer and use it in GitHub Desktop.
Save slowkow/395448fe1094c8ea4c7d to your computer and use it in GitHub Desktop.
Get symbols (or any other IDs) corresponding to Ensembl gene IDs
# 1. Install biomaRt.
# The old biocLite.R installer script was retired with Bioconductor 3.8 and
# refuses to run on R >= 3.5, so install through BiocManager instead.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("biomaRt")
# 2. Attach biomaRt.
library(biomaRt)
# 3. Look up the symbol and description for a handful of Ensembl gene IDs.
ensembl_ids <- c(
  "ENSG00000243485",
  "ENSG00000237613",
  "ENSG00000186092",
  "ENSG00000238009"
)
# Connect to the human gene dataset of the Ensembl mart.
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
# Fetch the requested attributes for the IDs listed above.
dat <- getBM(
  attributes = c("ensembl_gene_id", "external_gene_name", "description"),
  filters = "ensembl_gene_id",
  values = ensembl_ids,
  mart = mart
)
# Save the lookup table as an unquoted, tab-separated file without row names.
write.table(
  x = dat,
  file = "biomart.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
# 4. Example: change the row names for a dataframe from ensembl ids to gene names.
# Read the data. The first column of the file becomes the row names, which the
# rest of this example uses as Ensembl gene IDs.
d <- read.delim("counts.tsv.gz", row.names = 1)
# Query biomart for the gene name of every row.
library(biomaRt)
ensembl_ids <- rownames(d)
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
x <- getBM(
  mart = mart,
  values = ensembl_ids,
  # Spelled out in full: `filter` only worked through partial argument matching.
  filters = c("ensembl_gene_id"),
  attributes = c("ensembl_gene_id", "external_gene_name")
)
# Create a dictionary from ensembl id to gene name.
ens_to_gene <- as.character(x$external_gene_name)
names(ens_to_gene) <- as.character(x$ensembl_gene_id)
# Some ensembl ids do not map to any gene names: either the id is absent from
# the query result (lookup gives NA) or it can come back with an empty
# external_gene_name. Keep the ensembl id itself in both cases, so that
# unnamed genes are not all pooled under the single name "".
gene_names <- ens_to_gene[rownames(d)]
unmapped <- is.na(gene_names) | !nzchar(gene_names)
gene_names[unmapped] <- rownames(d)[unmapped]
# Several Ensembl IDs can map to the same gene name, so average the rows that share a name.
library(reshape2)
library(data.table)
# Average the columns of `dat` within groups of rows.
#
# dat: table-like object accepted by data.table(); every column is averaged.
# xs:  grouping label for each row of `dat` (one entry per row).
#
# Returns a data.table with one row per distinct label and one column per
# original column, holding the group means (NA values are dropped before
# averaging). The labels are assigned as row names and the helper grouping
# column is removed before returning.
mean_by <- function(dat, xs) {
  # Work on a data.table copy that carries the grouping label as a column.
  labelled <- data.table(dat)
  labelled$agg_var <- xs
  # Long format: every remaining column is melted against the label.
  long <- melt(labelled, id.vars = "agg_var")
  # Back to wide format, collapsing each (label, column) cell with mean().
  averaged <- dcast.data.table(
    data = long,
    formula = agg_var ~ variable,
    fun.aggregate = mean,
    na.rm = TRUE,
    value.var = "value"
  )
  # Keep the labels as row names, then drop the helper column by reference.
  rownames(averaged) <- averaged$agg_var
  averaged[, agg_var := NULL]
  averaged
}
# Average the rows that share a gene name, then save the result.
dd <- mean_by(d, gene_names)
write.table(
  x = dd,
  file = "out.tsv",
  sep = "\t",
  quote = FALSE,
  # With row names written, col.names = NA adds a blank header cell above them.
  col.names = NA
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment