Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save MarcinKosinski/bf8924c2ac1da613af675e66c6a12545 to your computer and use it in GitHub Desktop.
Save MarcinKosinski/bf8924c2ac1da613af675e66c6a12545 to your computer and use it in GitHub Desktop.
extractSurvival <- function(cohorts){
survivalData <- list()
for(i in cohorts){
get(paste0(i, ".clinical"), envir = .GlobalEnv) %>%
select(patient.bcr_patient_barcode,
patient.vital_status,
patient.days_to_last_followup,
patient.days_to_death ) %>%
mutate(bcr_patient_barcode = toupper(patient.bcr_patient_barcode),
patient.vital_status = ifelse(patient.vital_status %>%
as.character() =="dead",1,0),
barcode = patient.bcr_patient_barcode %>%
as.character(),
times = ifelse( !is.na(patient.days_to_last_followup),
patient.days_to_last_followup %>%
as.character() %>%
as.numeric(),
patient.days_to_death %>%
as.character() %>%
as.numeric() )
) %>%
filter(!is.na(times)) -> survivalData[[i]]
}
do.call(rbind,survivalData) %>%
select(bcr_patient_barcode, patient.vital_status, times) %>%
unique
}
extractMutations <- function(cohorts, prc){
mutationsData <- list()
for(i in cohorts){
get(paste0(i, ".mutations"), envir = .GlobalEnv) %>%
select(Hugo_Symbol, bcr_patient_barcode) %>%
filter(nchar(bcr_patient_barcode)==15) %>%
filter(substr(bcr_patient_barcode, 14, 15)=="01") %>%
unique -> mutationsData[[i]]
}
do.call(rbind,mutationsData) %>% unique -> mutationsData
mutationsData %>%
group_by(Hugo_Symbol) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>%
mutate(count_prc = count/length(unique(mutationsData$bcr_patient_barcode))) %>%
filter_(paste0("count_prc > ",prc)) %>%
select(Hugo_Symbol) %>%
unlist -> topGenes
mutationsData %>%
filter(Hugo_Symbol %in% topGenes) -> mutationsData_top
mutationsData_top %>%
dplyr::group_by(bcr_patient_barcode) %>%
dplyr::summarise(count = n()) %>%
group_by(count) %>%
summarise(total = n()) %>%
arrange(desc(count))
#
# mutationsData_top %>%
# spread(Hugo_Symbol, bcr_patient_barcode) -> mutationsData_top_sp
as.data.table(mutationsData_top) -> mutationsData_top_DT
dcast.data.table(mutationsData_top_DT, bcr_patient_barcode ~ Hugo_Symbol , fill = 0) %>%
as.data.frame -> mutationsData_top_dcasted
mutationsData_top_dcasted[,-1][mutationsData_top_dcasted[,-1] != "0"] <- 1
mutationsData_top_dcasted -> result
names(result) <- gsub(names(result),pattern = "-", replacement = "")
result
}
extractCohortIntersection <- function(){
data(package = "RTCGA.mutations")$results[,3] %>%
gsub(".mutations", "", x = .) -> mutations_data
data(package = "RTCGA.clinical")$results[,3] %>%
gsub(".clinical", "", x = .) -> clinical_data
intersect(mutations_data, clinical_data)
}
prepareCoxDataSplit <- function(mutationsData, survivalData, groups, seed = 4561){
mutationsData %>%
mutate(bcr_patient_barcode = substr(bcr_patient_barcode,1,12)) %>%
left_join(survivalData,
by = "bcr_patient_barcode") -> coxData
coxData <- coxData[, -c(1,2)]
coxData %>%
filter(times > 0) %>%
filter(!is.na(times)) -> coxData
apply(coxData[,-c(1092, 1093)], MARGIN = 2,function(x){
as.numeric(as.character(x))
}) -> coxData[,-c(1092, 1093)]
set.seed(seed)
sample(groups, replace = TRUE, size = 6085) -> groups
split(coxData, groups) #coxData_split
}
prepareForumlaSGD <- function(coxData){
as.formula(paste("Surv(times, patient.vital_status) ~ ",
paste(names(coxData[[1]])[-c(1092, 1093)],
collapse="+"), collapse = ""))
}
full_cox_loglik_matrix <- function(beta, x, censored){
order(x$times) -> order2
x[order2, ] -> xORD
censored[order2] -> censORD
sum(censORD*(beta%*%x[, -which(names(x)=='times')] -
log(cumsum(exp(beta1*rev(x1) + beta2*rev(x2))))))
}
library(dplyr)
library(RTCGA.clinical)
library(RTCGA.mutations)
library(data.table)
library(coxphSGD)
library(archivist)
(extractCohortIntersection() -> cohorts)
head(extractSurvival(cohorts) -> survivalData)
extractMutations(cohorts, 0.02) -> mutationsData
mutationsData[1:8, c(1,4,56,100,207,801)]
set.seed(4561)
prepareCoxDataSplit(mutationsData,survivalData, groups = 100) -> coxData_split
head(coxData_split[[1]][c(1,10), c(210,302,356,898,911,1092:1093)])
prepareForumlaSGD(coxData_split) -> formulaSGD
coxData_split[99:100] -> testCox
coxData_split[1:98] -> trainCox
coxphSGD(formulaSGD, data = trainCox, max.iter = 490) -> model_1_over_t
coxphSGD(formulaSGD, data = trainCox, max.iter = 490,
learningRates = function(t){1/(50*sqrt(t))}) -> model_1_over_50sqrt_t
coxphSGD(formulaSGD, data = trainCox, max.iter = 490,
learningRates = function(t){1/(100*sqrt(t))}) -> model_1_over_100sqrt_t
saveToRepo(testCox)
saveToRepo(trainCox)
saveToRepo(model_1_over_t)
saveToRepo(model_1_over_50sqrt_t)
saveToRepo(model_1_over_100sqrt_t)
model_1_over_t$coefficients[c(98, 196, 294, 392, 490)] -> coeffs_1_overt
do.call(rbind, coeffs_1_overt) %>%
as.data.frame %>%
mutate(epoka = 1:5) %>%
tidyr::gather(key=epoka) ->gat
names(gat)[2]<- "gen"
library(ggplot2)
gat %>%
ggplot(aes(value)) + geom_histogram(fill=4, col = 1, binwidth = 0.01) +
facet_grid(epoka~.) +
theme_classic(base_size = 15) +
ylab('liczność') + xlab('wartości współczynników modelu') +
ggtitle('Histogramy wartości współczynników modelu po kolejnych epokach') + coord_cartesian(xlim = c(-0.5,0.5)) -> fig1
model_1_over_50sqrt_t$coefficients[c(98, 196, 294, 392, 490)] -> coeffs_over_50sqrt_t
do.call(rbind, coeffs_over_50sqrt_t) %>%
as.data.frame %>%
mutate(epoka = 1:5) %>%
tidyr::gather(key=epoka) ->gat2
names(gat2)[2]<- "gen"
library(ggplot2)
gat2 %>%
ggplot(aes(value)) + geom_histogram(fill=5, col = 1, binwidth = 0.01) +
facet_grid(epoka~.) +
theme_classic(base_size = 15) +
ylab('liczność') + xlab('wartości współczynników modelu') +
ggtitle('Histogramy wartości współczynników modelu po kolejnych epokach') + coord_cartesian(xlim = c(-0.25,0.25)) -> fig2
model_1_over_100sqrt_t$coefficients[c(98, 196, 294, 392, 490)] -> coeffs_over_100sqrt_t
do.call(rbind, coeffs_over_100sqrt_t) %>%
as.data.frame %>%
mutate(epoka = 1:5) %>%
tidyr::gather(key=epoka) ->gat3
names(gat3)[2]<- "gen"
gat3 %>%
ggplot(aes(value)) + geom_histogram(fill=6, col = 1, binwidth = 0.01) +
facet_grid(epoka~.) +
theme_classic(base_size = 15) +
ylab('liczność') + xlab('wartości współczynników modelu') +
ggtitle('Histogramy wartości współczynników modelu po kolejnych epokach') + coord_cartesian(xlim = c(-0.1,0.1)) -> fig3
gat %>%
mutate(model = "model1") %>%
bind_rows(gat2 %>%
mutate(model = "model2")) %>%
bind_rows(gat3 %>%
mutate(model = "model3")) %>%
ggplot(aes(value)) + geom_histogram(fill=4, col = 1, binwidth = 0.001) +
facet_grid(epoka~model) +
theme_classic(base_size = 15) +
ylab('liczność') + xlab('wartość współczynnika modelu') +
ggtitle('Histogramy wartości współczynników modelu po kolejnych epokach') -> fig4
do.call(rbind, coeffs_over_100sqrt_t) %>% t %>% as.data.frame -> model3
do.call(rbind, coeffs_over_50sqrt_t) %>% t %>% as.data.frame -> model2
do.call(rbind, coeffs_1_overt) %>% t %>% as.data.frame -> model1
names(model3) <- c("epoka1", "epoka2","epoka3", "epoka4", "epoka5")
names(model2) <- c("epoka1", "epoka2","epoka3", "epoka4", "epoka5")
names(model1) <- c("epoka1", "epoka2","epoka3", "epoka4", "epoka5")
library(GGally)
ggpairs( title = 'Różnice między współczynnikami w kolejnych epokach',
model1,
upper = "blank",
lower = list(continuous = "points")
) -> fig5
saveToRepo(fig1) # 3dc2e14a037bbaae9e892dd255150c28
saveToRepo(fig2) # 6e5065abdbeae888a2836c1e59460424
saveToRepo(fig3) # ac9a06e9c6428581cdacb2c1e56e5b57
saveToRepo(fig4) # 60e327d310c600493c6c661f7330b868
saveToRepo(fig5) # db5b77ff56b7128244e88d905173cf22
pdf('hist_overt_t.pdf', width = 10, height = 8 )
fig2
dev.off()
alink('3dc2e14a037bbaae9e892dd255150c28', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex')
alink('6e5065abdbeae888a2836c1e59460424', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex')
alink('ac9a06e9c6428581cdacb2c1e56e5b57', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex')
alink('60e327d310c600493c6c661f7330b868', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex')
alink('db5b77ff56b7128244e88d905173cf22', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex')
names(tail(sort(abs(tail(model_1_over_t$coefficients,1)[[1]])),40)) -> top40
tail(model_1_over_t$coefficients,1)[[1]][which(names(tail(model_1_over_t$coefficients,1)[[1]]) %in% top40)] -> real_top40
data.frame(beta = sort(real_top40, decreasing = TRUE),
exp_beta = exp(sort(real_top40, decreasing = TRUE))) %>% xtable::xtable()
do.call(rbind, trainCox) -> trainCox_df
colMeans(trainCox_df[, -c(1092:1093)]) -> srednie_trainCox_df
data.frame( beta =tail(model_1_over_t$coefficients,1)[[1]],
beta_star = srednie_trainCox_df) -> wyniki
data.frame( gen = rownames(wyniki),
beta = round(wyniki[, 1], 3),
exp_beta = round(exp(wyniki[, 1]),2),
srednia = round(wyniki[, 2],3),
beta_star = round(wyniki[, 1]/wyniki[, 2], 2) ) %>%
arrange(desc(beta_star)) %>%
.[1:40,] %>% xtable::xtable()
aoptions('repoDir', getwd())
loadFromLocalRepo('446ac4dcb7d65bf39057bb341b296f1a')
do.call(rbind, model_1_over_t$coefficients) -> wspolczynniki
qplot(1:491,wspolczynniki[ ,which(colnames(wspolczynniki) == "BRAF")]) +
xlab('Krok algorytmu') +
ylab('Wartość współczynnika') +
theme_minimal(base_size = 20) +
ggtitle('Wartości współczynnika genu BRAF') -> fig8
qplot(1:491,wspolczynniki[ ,which(colnames(wspolczynniki) == "EGFR")]) +
xlab('Krok algorytmu') +
ylab('Wartość współczynnika') +
theme_minimal(base_size = 20) +
ggtitle('Wartości współczynnika genu EGFR') -> fig9
saveToRepo(fig8) # 3cd8e1f6da766d3028fc2d8f5fef220f
alink('3cd8e1f6da766d3028fc2d8f5fef220f', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex')
saveToRepo(fig9) # 0f5e8e306cc2cba11fceb2bb5f34cbeb
alink('0f5e8e306cc2cba11fceb2bb5f34cbeb', repo = 'coxphSGD', user = 'MarcinKosinski', format = 'latex')
pdf('fig8.pdf', width = 10, height = 8 )
fig8
dev.off()
pdf('fig9.pdf', width = 10, height = 8 )
fig9
dev.off()
library(archivist)
setLocalRepo(getwd())
loadFromLocalRepo('1a06bef4a60a237bb65ca3e2f3f23515')
loadFromLocalRepo('3eebc99bd231b16a3ea4dbeec9ab5edb')
do.call(rbind,testCox) -> testCox_binded
do.call(rbind,trainCox) -> trainCox_binded
library(dplyr)
library(survMisc)
trainCox_binded %>%
select(patient.vital_status, times, BRAF, EGFR) %>%
survfit( Surv(times, patient.vital_status) ~ BRAF, data = .) %>%
survMisc:::autoplot.survfit( titleSize=12, type="CI", palette = "Set1",
alpha = 0.7, survLineSize=2,
pX = 0.3, sigP =3, title = "Przeżycie a mutacja genu BRAF" ) -> km_plot
xlims <- c(0,5000)
library(ggplot2)
km_plot[[1]] <- km_plot[[1]] +
coord_cartesian(xlim = c(xlims[1],xlims[2])) +
theme_light(base_size = 22)
km_plot[[2]] <- km_plot[[2]] +
coord_cartesian(xlim = xlims)+
theme_light(base_size = 22)
pdf("BRAF.pdf", width = 10, height = 8)
survMisc::autoplot(km_plot)
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment