@kstawiski
Last active April 7, 2022 13:25
Sample size estimation for a circulating miRNA biomarker study based on nonlinear weighted least squares optimization.
# This code is a part of paper entitled: "Deep learning-based integration of circulating and cellular miRNAs expression of pancreatic cancer"
# Author: Konrad Stawiski, MD, PhD (konrad@konsta.com.pl; https://konsta.com.pl)
# Based on https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/1472-6947-12-8
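# The referenced approach (Figueroa et al., BMC Med Inform Decis Mak 2012) models the
# learning curve with an inverse power law, y = (1 - a) - b * x^c, where x is the sample
# size, y the expected classifier performance (AUC), and a, b, c are estimated by
# weighted nonlinear least squares; FitModel() below implements this fit.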
library(dplyr)
setwd("C:/Users/konra/OneDrive/Doktorat/Analiza/Szacowanie")
set.seed(1)
# Get exemplary file
Elias2017 <- data.table::fread("Elias2017.csv")
Y = as.factor(Elias2017$Class) # Class = Case or Control
X = as.matrix(dplyr::select(Elias2017, starts_with("hsa"))) # select all miRNAs
temp = dplyr::select(Elias2017, starts_with("hsa"), Class)
temp$Class = as.factor(temp$Class)
# Modifications start here
library(doParallel)
cl <- makePSOCKcluster(parallel::detectCores()-1)
registerDoParallel(cl)
library(caret)
library(nnet)
trControl = trainControl(method = "repeatedcv",
                         number = 10,
                         repeats = 5,
                         classProbs = TRUE,
                         summaryFunction = twoClassSummary,
                         search = "random")
# Results table ("wyniki" = results in Polish); filled in the loop below
wyniki = data.frame(i = numeric(), meanROC = numeric(), lower = numeric(), upper = numeric())
for (i in 20:nrow(temp)) {
  try({
    cat(i)
    # Random subsample of i observations (patients)
    tempindex = sample(1:nrow(temp), i)
    # Neural network (caret "nnet") tuned by random search, 5x repeated 10-fold CV
    tempmodel = train(Class ~ ., data = temp[tempindex, ], method = "nnet", family = "binomial",
                      trControl = trControl, tuneLength = 500)
    # Mean cross-validated AUC and its 95% CI across the evaluated tuning candidates
    m = mean(tempmodel$results$ROC, na.rm = T)
    t = t.test(tempmodel$results$ROC, na.rm = T)
    lower = t$conf.int[1]
    upper = t$conf.int[2]
    wyniki = rbind(wyniki, c(i, m, lower, upper))
  })
}
colnames(wyniki) = c("i", "meanROC", "lower", "upper")
data.table::fwrite(wyniki, paste0("symulacja", as.numeric(Sys.time()), ".csv")) # "symulacja" = simulation
stopCluster(cl)
library(ggplot2)
p <- ggplot(wyniki, aes(x = i, y = meanROC)) +
  geom_line() +
  geom_point() +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = .2,
                position = position_dodge(0.05))
# Finished line plot
p + labs(title = "Samples", x = "Number of samples", y = "mean AUC ROC (5x 10-fold CV)") +
  theme_classic()
#######################
FitModel <- function(offset, X, Y, W, i, N, startParams)
{
  # Data considering only points between offset and i
  x <- X[offset:i]
  y <- Y[offset:i]
  w <- W[offset:i]
  # Inverse power-law learning curve y = (1 - a) - b * x^c, with analytic gradient/Hessian
  gradientF <- deriv3(~ (1 - a) - (b * (x^c)), c("a", "b", "c"), function(a, b, c, x) NULL)
  # Fit the model by weighted nonlinear least squares (nls, "port" algorithm, box constraints)
  m <- nls(y ~ gradientF(a, b, c, x), start = startParams, weights = w,
           control = list(maxiter = 1000, warnOnly = TRUE),
           algorithm = "port", upper = list(a = 10, b = 10, c = -0.1),
           lower = list(a = 0, b = 0, c = -10), data = data.frame(y = y, x = x))
  # Predict Y for sample sizes not used to fit the curve:
  # if all data was used to fit the model, testing data = training data;
  # else, testing data = (total data - training data)
  if (i == N) {
    testX <- X[offset:i]
    testY <- Y[offset:i]
    testW <- W[offset:i]
  } else {
    testX <- X[(i + 1):N]
    testY <- Y[(i + 1):N]
    testW <- W[(i + 1):N]
  }
  # Predictions on the remaining (unseen) X
  prediction <- predict(m, list(x = testX))
  # Confidence intervals from the prediction gradient and the parameter covariance matrix
  se.fit <- sqrt(apply(attr(predict(m, list(x = testX)), "gradient"), 1,
                       function(x) sum(vcov(m) * outer(x, x))))
  prediction.ci <- prediction + outer(se.fit, qnorm(c(.5, .025, .975)))
  predictY <- prediction.ci[, 1]
  predictY.lw <- prediction.ci[, 2]
  predictY.up <- prediction.ci[, 3]
  # Calculate residuals
  if (i == N)
    res <- rep(0, length(X))
  else
    res <- rep(0, length(X) - i)
  res <- (predictY - testY)
  # Root mean squared error and mean absolute error of the extrapolation.
  # rmse() and abse() are assumed to be defined elsewhere (e.g. Metrics::rmse()
  # and Metrics::mae() are drop-in equivalents).
  rmseValue <- rmse(testY, predictY)
  maeValue <- abse(testY, predictY)
  # Return the fitted curve, its predictions and the error metrics
  list(model = m, predictY = predictY, predictY.lw = predictY.lw, predictY.up = predictY.up,
       residuals = res, rmse = rmseValue, mae = maeValue)
}
#End function
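# --- Hypothetical usage sketch (not part of the original gist) ---
# Fits the learning curve to the simulated AUCs in `wyniki` and extrapolates
# performance to larger sample sizes. The weights, starting parameters and
# target sample sizes below are illustrative assumptions, not values from the paper.
rmse <- function(obs, pred) sqrt(mean((obs - pred)^2))   # helper assumed by FitModel()
abse <- function(obs, pred) mean(abs(obs - pred))        # helper assumed by FitModel()
obsX <- wyniki$i                            # sample sizes used in the simulation
obsY <- wyniki$meanROC                      # mean cross-validated AUCs
obsW <- seq_along(obsX) / length(obsX)      # assumed weights favouring larger subsamples
fit  <- FitModel(offset = 1, X = obsX, Y = obsY, W = obsW,
                 i = length(obsX), N = length(obsX),
                 startParams = list(a = 0.1, b = 1, c = -0.5))
# Extrapolate the fitted curve to candidate sample sizes, e.g. 100-1000 patients
targetN <- seq(100, 1000, by = 50)
predAUC <- as.numeric(predict(fit$model, list(x = targetN)))
plot(targetN, predAUC, type = "b",
     xlab = "Number of samples", ylab = "Predicted mean AUC")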