Last active
June 1, 2022 13:28
-
-
Save slowkow/395448fe1094c8ea4c7d to your computer and use it in GitHub Desktop.
Get symbols (or any other ids) corresponding to Ensembl gene ids
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 1. Install biomaRt (skipped when it is already installed).
#    The legacy `biocLite()` installer has been retired by Bioconductor, and
#    source()-ing a remote script over plain HTTP executes whatever the
#    network returns. BiocManager is the supported replacement.
if (!requireNamespace("biomaRt", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("biomaRt")
}

# 2. Load biomaRt.
library(biomaRt)

# 3. Get symbols for Ensembl IDs.
ensembl_ids <- c(
  "ENSG00000243485", "ENSG00000237613", "ENSG00000186092", "ENSG00000238009"
)

# Connect to the "ensembl" BioMart and select the human gene dataset.
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")

# Look up the gene name and description for the requested Ensembl gene ids.
dat <- getBM(
  attributes = c("ensembl_gene_id", "external_gene_name", "description"),
  filters = "ensembl_gene_id",
  values = ensembl_ids,
  mart = mart
)

# Save the lookup table as an unquoted, tab-separated file without row names.
write.table(
  dat,
  file = "biomart.tsv",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)
# 4. Example: change the row names for a dataframe from ensembl ids to gene names.

# Read the data; the first column becomes the row names, which are used as
# the Ensembl gene ids below.
d <- read.delim("counts.tsv.gz", row.names = 1)

# Query biomart.
library(biomaRt)
ensembl_ids <- rownames(d)
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
x <- getBM(
  mart = mart,
  values = ensembl_ids,
  # Spelled out in full: `filter` only worked through partial argument
  # matching of `filters`.
  filters = "ensembl_gene_id",
  attributes = c("ensembl_gene_id", "external_gene_name")
)

# Create a dictionary from ensembl id to gene name.
ens_to_gene <- as.character(x$external_gene_name)
names(ens_to_gene) <- as.character(x$ensembl_gene_id)

# Some ensembl ids do not map to any gene names; keep the ensembl id for those.
# An id missing from the query result yields NA here. A gene name may also
# come back as an empty string (NOTE(review): confirm against your biomaRt
# version); those are treated as unmapped too, otherwise every unnamed gene
# would later be averaged into a single row named "".
gene_names <- ens_to_gene[rownames(d)]
unmapped <- is.na(gene_names) | !nzchar(gene_names)
gene_names[unmapped] <- rownames(d)[unmapped]
# Multiple ensembl ids map to a single gene name, so average those ensembl ids.
library(reshape2)
library(data.table)

# Average the rows of `dat` that share the same label in `xs`.
#
# Arguments:
#   dat  Table-like object whose columns hold the values to average.
#   xs   Grouping labels assigned to the rows of `dat`.
#
# Returns a data.table with one row per distinct label and one column per
# original column of `dat`, holding the mean (NA values ignored) of that
# column within the group. The labels are stored as the row names and the
# helper `agg_var` column is removed before returning.
mean_by <- function(dat, xs) {
  # Attach the grouping labels as an extra column of a data.table copy.
  long <- data.table(dat)
  long$agg_var <- xs

  # Long format: one row per (label, original column) pair.
  long <- melt(long, id.vars = "agg_var")

  # Back to wide format, collapsing each group with the mean.
  wide <- dcast.data.table(
    data = long,
    formula = agg_var ~ variable,
    fun.aggregate = mean,
    na.rm = TRUE,
    value.var = "value"
  )

  # Keep the labels as row names, then drop the helper column by reference.
  rownames(wide) <- wide$agg_var
  wide[, agg_var := NULL]
  wide
}
# Write the output.
# Average the rows of `d` that share a gene name, then save the result as an
# unquoted, tab-separated file. With row names written (the default),
# `col.names = NA` adds a blank header field above the row-name column.
dd <- mean_by(d, gene_names)
write.table(
  x = dd,
  file = "out.tsv",
  quote = FALSE,
  sep = "\t",
  col.names = NA
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment