Skip to content

Instantly share code, notes, and snippets.

@mcanouil
Last active February 2, 2024 23:34
Show Gist options
  • Save mcanouil/fd32a508c25141df38354e9d73e65cc2 to your computer and use it in GitHub Desktop.
Save mcanouil/fd32a508c25141df38354e9d73e65cc2 to your computer and use it in GitHub Desktop.
# # MIT License
#
# Copyright (c) 2024 Mickaël Canouil
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
find_tissue <- function(
files,
type = "rsem",
tpm_threshold = 100,
url = "https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
) {
if (type != "rsem" || !all(grepl("genes.results$", files))) {
stop("Only RSEM gene files are supported!", call. = FALSE)
}
gtex_raw_data <- data.table::fread(url)[, -c("Description")]
gtex_markers <- data.table::melt(
data = gtex_raw_data[j = "Name" := gsub("\\..*$", "", Name)],
id.vars = "Name",
variable.name = "Tissue",
value.name = "TPM"
)[TPM >= tpm_threshold][j = Tissue := as.character(Tissue)]
gtex_markers <- gtex_markers[
i = gtex_markers[j = .SD[j = data.table::uniqueN(Tissue)], by = "Name"][V1 %in% 1:3],
j = .SD,
on = c("Name")
]
samples_tissue <- Reduce(
f = function(x, y) merge(x, y, by = "Tissue", all = TRUE),
x = lapply(
X = files,
.gtex = gtex_markers,
FUN = function(.file, .gtex) {
out <- merge(
x = .gtex,
y = data.table::fread(
file = .file,
select = c("gene_id", "TPM"),
col.names = c("Name", "sample_tpm")
),
by = "Name",
all = FALSE
)[
i = sample_tpm >= tpm_threshold & TPM >= sample_tpm,
j = .N,
by = "Tissue"
]
data.table::setnames(out, old = "N", new = gsub(".genes.results$", "", basename(.file)))
}
)
)
class(samples_tissue) <- c("tissue", class(samples_tissue))
data.table::setnafill(
x = samples_tissue,
fill = 0,
cols = setdiff(names(samples_tissue), "Tissue")
)[
j = .(
Tissue = Tissue[order(- rowMeans(.SD) / sum(rowMeans(.SD)))],
.SD[order(- rowMeans(.SD) / sum(rowMeans(.SD)))]
),
.SDcols = gsub(".genes.results$", "", basename(files))
]
}
is.unique <- function(object) {
length(unique(summary.tissue(object)[["best_tissue"]])) == 1
}
summary.tissue <- function(object, ...) {
object[
j = data.table::data.table(
t(sapply(.SD, function(x) c(best_tissue = Tissue[which.max(x)], n_gene = x[which.max(x)]))),
keep.rownames = "sample_id"
),
.SDcols = setdiff(names(object), "Tissue")
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment