Skip to content

Instantly share code, notes, and snippets.

@wenjie1991
Created July 13, 2020 16:02
Show Gist options
  • Save wenjie1991/afeb50691de243666e8fb59a5701c0a9 to your computer and use it in GitHub Desktop.
Save wenjie1991/afeb50691de243666e8fb59a5701c0a9 to your computer and use it in GitHub Desktop.
PAN cancer gene expression
---
output:
html_document:
toc: true
theme: united
number_sections: true
always_allow_html: yes
---
```{r echo=F}
library(knitr)
opts_chunk$set(echo=T, TOC=T, warnings = F)
```
```{r}
library(data.table)
library(ggplot2)
library(plyr)
library(magrittr)
library(readr)
library(stringr)
```
# Prepare the data
Download data
```{r, eval=F}
cancer_list = readLines("../data/tumor_type_list.txt") %>% Filter(function(x) { x != "" }, .) %>% unique
gene_list = readLines("../data/OATP_list.txt") %>% Filter(function(x) { x != "" }, .) %>% unique
gene_list = c(gene_list, "GAPDH", "ACTB")
## create a fold to hold the data
dir.create("../data/OATP_TSVdb_PAN_cancer", showWarnings = F, recursive = T)
download_data = function(gene, cancer, dir) {
library(stringr)
cmd = str_glue("curl \"http://api.smallysun.com/sv_datatable?gene=THBS1&tumor={cancer}&cd=sampletype&area=exon&gene_sort={gene}\" | gzcat > {dir}/tcga_{cancer}_{gene}.tsv", gene = gene, cancer = cancer, dir = dir)
system(cmd)
}
for (i_cancer in cancer_list) {
for (i_gene in gene_list) {
print(i_gene)
print(i_cancer)
download_data(i_gene, i_cancer, "../data/OATP_TSVdb_PAN_cancer/")
Sys.sleep(0.1)
}
```
Prepare data table
```{r message=FALSE, warning=FALSE}
file_list = dir("../data/OATP_TSVdb_PAN_cancer", full=T)
d_l = lapply(file_list, fread)
names(d_l) = basename(file_list) %>% str_split_fixed("\\.", 2) %>% magrittr::extract(, 1)
d_l %<>% Filter(function(x) nrow(x) > 0, .)
d_l = lapply(d_l, function(d) {
d$expression = d[, grep("^gene", names(d), value=T), with=F] %>% unlist
if ("clinical_gender" %in% names(d)) {
d = d[, .(sampleID, sampletype = clinical_sampletype, gender = clinical_gender, expression)]
} else {
d = d[, .(sampleID, sampletype = clinical_sampletype, gender = "unknown", expression)]
}
d
})
d = ldply(d_l)
d_name_m = d$.id %>% str_split_fixed("_", 3)
d$tumor = d_name_m[, 2]
d$gene = d_name_m[, 3]
d_gene_per_person = d
write_tsv(d_gene_per_person, "../data/01_table.sample_gene_expression.tsv")
```
# Statistics
## All
Statistics table
```{r}
d = d_gene_per_person %>% data.table
d$expression = log2(d$expression + 1)
d = d[, .(
n = .N
, mean = mean(expression)
, sd = sd(expression)
, q5 = quantile(expression, 0.05)
, q25 = quantile(expression, 0.25)
, median = median(expression)
, q75 = quantile(expression, 0.75)
, q95 = quantile(expression, 0.95)
), by = .(tumor, gene, sampletype) ]
d_stat = d
write_tsv(d_stat, "../data/01_table.gene_tumor_expression.tsv")
```
Comparing
```{r}
## Pick the tumor with tumor - normal pair
paired_tumor_v = d[sampletype %in% c("Solid Tissue Normal", "Primary Solid Tumor"), .N, by = .(tumor, sampletype)][, .N, by = .(tumor)][N > 1, tumor]
d = d_gene_per_person %>% data.table
d$expression = log2(d$expression + 1)
d = d[tumor %in% paired_tumor_v][sampletype %in% c("Solid Tissue Normal", "Primary Solid Tumor")]
## Comparing the expression between tumor and normal
d = d[, .(
tumor_exp_mean = mean(expression[sampletype == "Primary Solid Tumor"])
, normal_exp_mean = mean(expression[sampletype == "Solid Tissue Normal"])
, wilcox.p = wilcox.test(expression[sampletype == "Primary Solid Tumor"], expression[sampletype == "Solid Tissue Normal"])$p.value
), by = .(tumor, gene)]
d[, log2FC := tumor_exp_mean - normal_exp_mean]
d = d[! (gene %in% c("GAPDH", "ACTB")) ]
d_diff = d
write_tsv(d_diff, "../data/01_table.DEGs.tsv")
## Visualization
theme0 <- theme_bw() + theme(
text = element_text(size = 15),
line = element_line(size = 1),
axis.line = element_line(size = 1),
axis.ticks.length = unit(3, units = "mm"),
axis.text.x = element_text(
margin = margin(t = 2, unit = "mm")
, angle = 60, vjust = 1, size = 15, hjust = 1),
axis.text.y = element_text(margin = margin(r = 3, l = 5, unit = "mm")),
legend.position = "right",
)
```
```{r, fig.height = 7, fig.width = 5}
g = ggplot(d_diff) + aes(x = gene, y = tumor, size = -log10(wilcox.p), color = log2FC)
p = g + geom_point() + theme0 +
scale_color_gradient2(low = "blue", mid = "white", high = "red") +
scale_size() + labs(x = "Gene", y = "Cancer Abbr", color = "logFC (T/N)", size = "-log10(p)")
p
```
```{r, fig.width = 9}
d = d_diff %>% melt(id.vars = c("gene", "tumor"), measure.vars = c("tumor_exp_mean", "normal_exp_mean"), variable.name = "sampletype", value.name = "expression")
g = ggplot(d) + aes(x = gene, y = tumor, size = expression, group = sampletype, color = sampletype)
p = g + geom_point(position = position_dodge(width = 0.5), alpha = 0.6) + theme0 +
scale_size() + labs(x = "Gene", y = "Cancer Abbr", color = "Sample type", size = "log2(RSEM + 1)")
p
```
## Female
Statistics table
```{r}
d = d_gene_per_person %>% data.table
d = d[gender == "FEMALE" | tumor %in% c("OV", "CESC", "UCEC", "UCS")]
d$expression = log2(d$expression + 1)
d = d[, .(
n = .N
, mean = mean(expression)
, sd = sd(expression)
, q5 = quantile(expression, 0.05)
, q25 = quantile(expression, 0.25)
, median = median(expression)
, q75 = quantile(expression, 0.75)
, q95 = quantile(expression, 0.95)
), by = .(tumor, gene, sampletype) ]
d_stat = d
write_tsv(d_stat, "../data/01_table.gene_tumor_expression.female.tsv")
```
Comparing
```{r}
## Pick the tumor with tumor - normal pair
paired_tumor_v = d[sampletype %in% c("Solid Tissue Normal", "Primary Solid Tumor"), .N, by = .(tumor, sampletype)][, .N, by = .(tumor)][N > 1, tumor]
d = d_gene_per_person %>% data.table
d = d[gender == "FEMALE" | tumor %in% c("OV", "CESC", "UCEC", "UCS")]
d$expression = log2(d$expression + 1)
d = d[tumor %in% paired_tumor_v][sampletype %in% c("Solid Tissue Normal", "Primary Solid Tumor")]
## Comparing the expression between tumor and normal
d = d[, .(
tumor_exp_mean = mean(expression[sampletype == "Primary Solid Tumor"])
, normal_exp_mean = mean(expression[sampletype == "Solid Tissue Normal"])
, wilcox.p = wilcox.test(expression[sampletype == "Primary Solid Tumor"], expression[sampletype == "Solid Tissue Normal"])$p.value
), by = .(tumor, gene)]
d[, log2FC := tumor_exp_mean - normal_exp_mean]
d = d[! (gene %in% c("GAPDH", "ACTB")) ]
d_diff = d
write_tsv(d_diff, "../data/01_table.DEGs.tsv")
## Visualization
theme0 <- theme_bw() + theme(
text = element_text(size = 15),
line = element_line(size = 1),
axis.line = element_line(size = 1),
axis.ticks.length = unit(3, units = "mm"),
axis.text.x = element_text(
margin = margin(t = 2, unit = "mm")
, angle = 60, vjust = 1, size = 15, hjust = 1),
axis.text.y = element_text(margin = margin(r = 3, l = 5, unit = "mm")),
legend.position = "right",
)
```
```{r, fig.height = 7, fig.width = 5}
g = ggplot(d_diff) + aes(x = gene, y = tumor, size = -log10(wilcox.p), color = log2FC)
p = g + geom_point() + theme0 +
scale_color_gradient2(low = "blue", mid = "white", high = "red") +
scale_size() + labs(x = "Gene", y = "Cancer Abbr", color = "logFC (T/N)", size = "-log10(p)")
p
```
```{r, fig.width= 9}
d = d_diff %>% melt(id.vars = c("gene", "tumor"), measure.vars = c("tumor_exp_mean", "normal_exp_mean"), variable.name = "sampletype", value.name = "expression")
g = ggplot(d) + aes(x = gene, y = tumor, size = expression, group = sampletype, color = sampletype)
p = g + geom_point(position = position_dodge(width = 0.5), alpha = 0.6) + theme0 +
scale_size() + labs(x = "Gene", y = "Cancer Abbr", color = "Sample type", size = "log2(RSEM + 1)")
p
```
## Male
Statistics table
```{r}
d = d_gene_per_person %>% data.table
d = d[gender == "MALE" | tumor %in% c("PRAD", "TGCT")]
d$expression = log2(d$expression + 1)
d = d[, .(
n = .N
, mean = mean(expression)
, sd = sd(expression)
, q5 = quantile(expression, 0.05)
, q25 = quantile(expression, 0.25)
, median = median(expression)
, q75 = quantile(expression, 0.75)
, q95 = quantile(expression, 0.95)
), by = .(tumor, gene, sampletype) ]
d_stat = d
write_tsv(d_stat, "../data/01_table.gene_tumor_expression.male.tsv")
```
Comparing
```{r}
## Pick the tumor with tumor - normal pair
paired_tumor_v = d[sampletype %in% c("Solid Tissue Normal", "Primary Solid Tumor"), .N, by = .(tumor, sampletype)][, .N, by = .(tumor)][N > 1, tumor]
d = d_gene_per_person %>% data.table
d = d[gender == "MALE" | (tumor %in% c("PRAD", "TGCT"))]
d$expression = log2(d$expression + 1)
d = d[tumor %in% paired_tumor_v][sampletype %in% c("Solid Tissue Normal", "Primary Solid Tumor")]
## Comparing the expression between tumor and normal
d = d[, .(
tumor_exp_mean = mean(expression[sampletype == "Primary Solid Tumor"])
, normal_exp_mean = mean(expression[sampletype == "Solid Tissue Normal"])
, wilcox.p = wilcox.test(expression[sampletype == "Primary Solid Tumor"], expression[sampletype == "Solid Tissue Normal"])$p.value
), by = .(tumor, gene)]
d[, log2FC := tumor_exp_mean - normal_exp_mean]
d = d[! (gene %in% c("GAPDH", "ACTB")) ]
d_diff = d
write_tsv(d_diff, "../data/01_table.DEGs.male.tsv")
## Visualization
theme0 <- theme_bw() + theme(
text = element_text(size = 15),
line = element_line(size = 1),
axis.line = element_line(size = 1),
axis.ticks.length = unit(3, units = "mm"),
axis.text.x = element_text(
margin = margin(t = 2, unit = "mm")
, angle = 60, vjust = 1, size = 15, hjust = 1),
axis.text.y = element_text(margin = margin(r = 3, l = 5, unit = "mm")),
legend.position = "right",
)
```
```{r, fig.height = 7, fig.width = 5}
g = ggplot(d_diff) + aes(x = gene, y = tumor, size = -log10(wilcox.p), color = log2FC)
p = g + geom_point() + theme0 +
scale_color_gradient2(low = "blue", mid = "white", high = "red") +
scale_size() + labs(x = "Gene", y = "Cancer Abbr", color = "logFC (T/N)", size = "-log10(p)")
p
```
```{r, fig.width = 9}
d = d_diff %>% melt(id.vars = c("gene", "tumor"), measure.vars = c("tumor_exp_mean", "normal_exp_mean"), variable.name = "sampletype", value.name = "expression")
g = ggplot(d) + aes(x = gene, y = tumor, size = expression, group = sampletype, color = sampletype)
p = g + geom_point(position = position_dodge(width = 0.5), alpha = 0.6) + theme0 +
scale_size() + labs(x = "Gene", y = "Cancer Abbr", color = "Sample type", size = "log2(RSEM + 1)")
p
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment