-
-
Save felipealbrecht/e17984adbfbce65c24ef3e50130df583 to your computer and use it in GitHub Desktop.
# Download all BLUEPRINT gene expression data and format it as a numeric matrix
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download all BLUEPRINT gene expression data from DeepBlue and reshape it
# into a numeric matrix (genes in rows, samples in columns).
#
# Dependencies ----
# DeepBlueR is installed from Bioconductor:
# http://bioconductor.org/packages/release/bioc/html/DeepBlueR.html
library(DeepBlueR)
library(dplyr)
library(tidyr)

# Retrieve data ----

# List all BLUEPRINT samples
blueprint_samples <- deepblue_list_samples(
  extra_metadata = list("source" = "BLUEPRINT Epigenome")
)

# Extract their ids
blueprint_samples_ids <- deepblue_extract_ids(blueprint_samples)

# Select gene expression data. Gene names are assigned using Gencode 22.
gene_exprs_query <- deepblue_select_expressions(
  expression_type = "gene",
  sample_ids = blueprint_samples_ids,
  gene_model = "gencode v22"
)

# Request the data and define the output format (one row per gene/sample pair)
request <- deepblue_get_regions(
  query_id = gene_exprs_query,
  output_format = "@GENE_ID(gencode v22),FPKM,@BIOSOURCE,@SAMPLE_ID"
)

# Download the data
gene_regions <- deepblue_download_request_data(request)

# Reshape ----

# Retain a table mapping sample ids to biosources
sample_names <- gene_regions %>%
  dplyr::select(`@BIOSOURCE`, `@SAMPLE_ID`) %>%
  dplyr::distinct()

# Collect gene ids that occur more than once within any single sample.
# Such genes cannot be spread into a wide table (the gene/sample key would not
# be unique), so they are dropped. Checking every sample, rather than one
# hard-coded reference sample, keeps the script working when the set of
# available samples changes.
duplicated_genes <- gene_regions %>%
  dplyr::count(`@SAMPLE_ID`, `@GENE_ID(gencode v22)`) %>%
  dplyr::filter(n > 1) %>%
  dplyr::pull(`@GENE_ID(gencode v22)`) %>%
  unique()

# Convert the long gene expression table to a wide one: one row per gene,
# one column per sample. `spread()` is kept (instead of `pivot_wider()`) so
# that the row/column ordering of the result is unchanged.
genes_wide <- gene_regions %>%
  dplyr::filter(!(`@GENE_ID(gencode v22)` %in% duplicated_genes)) %>%
  dplyr::select(-`@BIOSOURCE`) %>%
  tidyr::spread(key = `@SAMPLE_ID`, value = FPKM)

# ...and then to a numeric matrix with gene ids as row names.
# Normalising to a plain data.frame and using `[[` makes the extraction
# independent of whether the download is returned as a data.frame, tibble or
# data.table (NOTE(review): the exact return class is not visible here).
genes_wide <- as.data.frame(genes_wide)
genes <- genes_wide[[1]]
genes_matrix <- data.matrix(genes_wide[, -1, drop = FALSE])
rownames(genes_matrix) <- genes

### OUTPUT
### genes_matrix : The gene expression matrix for all BLUEPRINT samples
###                (276 samples according to the original note)
### sample_names : A mapping table from sample id to cell type / biosource
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment