@kstawiski
Last active April 7, 2022 13:25
Sample size estimation for a circulating miRNA biomarker study based on nonlinear weighted least squares optimization.
# This code is a part of paper entitled: "Deep learning-based integration of circulating and cellular miRNAs expression of pancreatic cancer"
# Author: Konrad Stawiski, MD, PhD (konrad@konsta.com.pl; https://konsta.com.pl)
# Based on https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/1472-6947-12-8
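# The referenced approach (Figueroa et al., BMC Med Inform Decis Mak 2012) models the
# learning curve with an inverse power law, y = (1 - a) - b * x^c, where x is the sample
# size, y the expected classifier performance (AUC), and a, b, c are estimated by
# weighted nonlinear least squares; FitModel() below implements this fit.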
library(dplyr)
setwd("C:/Users/konra/OneDrive/Doktorat/Analiza/Szacowanie")
set.seed(1)
# Get exemplary file
Elias2017 <- data.table::fread("Elias2017.csv")
Y = as.factor(Elias2017$Class) # Class = Case or Control
X = as.matrix(dplyr::select(Elias2017, starts_with("hsa"))) # select all miRNAs
temp = dplyr::select(Elias2017, starts_with("hsa"), Class)
temp$Class = as.factor(temp$Class)
# Modifications start here
library(doParallel)
cl <- makePSOCKcluster(parallel::detectCores()-1)
registerDoParallel(cl)
library(caret)
library(nnet)
trControl = trainControl(method = "repeatedcv",
                         number = 10,
                         repeats = 5,
                         classProbs = TRUE,
                         summaryFunction = twoClassSummary,
                         search = "random")
# Results table ("wyniki" = results in Polish); filled in the loop below
wyniki = data.frame(i = numeric(), meanROC = numeric(), lower = numeric(), upper = numeric())
for (i in 20:nrow(temp)) {
  try({
    cat(i)
    # Random subsample of i observations (patients)
    tempindex = sample(1:nrow(temp), i)
    # Neural network (caret "nnet") tuned by random search, 5x repeated 10-fold CV
    tempmodel = train(Class ~ ., data = temp[tempindex, ], method = "nnet", family = "binomial",
                      trControl = trControl, tuneLength = 500)
    # Mean cross-validated AUC and its 95% CI across the evaluated tuning candidates
    m = mean(tempmodel$results$ROC, na.rm = T)
    t = t.test(tempmodel$results$ROC, na.rm = T)
    lower = t$conf.int[1]
    upper = t$conf.int[2]
    wyniki = rbind(wyniki, c(i, m, lower, upper))
  })
}
colnames(wyniki) = c("i", "meanROC", "lower", "upper")
data.table::fwrite(wyniki, paste0("symulacja", as.numeric(Sys.time()), ".csv")) # "symulacja" = simulation
stopCluster(cl)
library(ggplot2)
p <- ggplot(wyniki, aes(x = i, y = meanROC)) +
  geom_line() +
  geom_point() +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = .2,
                position = position_dodge(0.05))
# Finished line plot
p + labs(title = "Samples", x = "Number of samples", y = "mean AUC ROC (5x 10-fold CV)") +
  theme_classic()
#######################
FitModel <- function(offset, X, Y, W, i, N, startParams)
{
  # Data considering only points between offset and i
  x <- X[offset:i]
  y <- Y[offset:i]
  w <- W[offset:i]
  # Inverse power-law learning curve y = (1 - a) - b * x^c, with analytic gradient/Hessian
  gradientF <- deriv3(~ (1 - a) - (b * (x^c)), c("a", "b", "c"), function(a, b, c, x) NULL)
  # Fit the model by weighted nonlinear least squares (nls, "port" algorithm, box constraints)
  m <- nls(y ~ gradientF(a, b, c, x), start = startParams, weights = w,
           control = list(maxiter = 1000, warnOnly = TRUE),
           algorithm = "port", upper = list(a = 10, b = 10, c = -0.1),
           lower = list(a = 0, b = 0, c = -10), data = data.frame(y = y, x = x))
  # Predict Y for sample sizes not used to fit the curve:
  # if all data was used to fit the model, testing data = training data;
  # else, testing data = (total data - training data)
  if (i == N) {
    testX <- X[offset:i]
    testY <- Y[offset:i]
    testW <- W[offset:i]
  } else {
    testX <- X[(i + 1):N]
    testY <- Y[(i + 1):N]
    testW <- W[(i + 1):N]
  }
  # Predictions on the remaining (unseen) X
  prediction <- predict(m, list(x = testX))
  # Confidence intervals from the prediction gradient and the parameter covariance matrix
  se.fit <- sqrt(apply(attr(predict(m, list(x = testX)), "gradient"), 1,
                       function(x) sum(vcov(m) * outer(x, x))))
  prediction.ci <- prediction + outer(se.fit, qnorm(c(.5, .025, .975)))
  predictY <- prediction.ci[, 1]
  predictY.lw <- prediction.ci[, 2]
  predictY.up <- prediction.ci[, 3]
  # Calculate residuals
  if (i == N)
    res <- rep(0, length(X))
  else
    res <- rep(0, length(X) - i)
  res <- (predictY - testY)
  # Root mean squared error and mean absolute error of the extrapolation.
  # rmse() and abse() are assumed to be defined elsewhere (e.g. Metrics::rmse()
  # and Metrics::mae() are drop-in equivalents).
  rmseValue <- rmse(testY, predictY)
  maeValue <- abse(testY, predictY)
  # Return the fitted curve, its predictions and the error metrics
  list(model = m, predictY = predictY, predictY.lw = predictY.lw, predictY.up = predictY.up,
       residuals = res, rmse = rmseValue, mae = maeValue)
}
#End function
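# --- Hypothetical usage sketch (not part of the original gist) ---
# Fits the learning curve to the simulated AUCs in `wyniki` and extrapolates
# performance to larger sample sizes. The weights, starting parameters and
# target sample sizes below are illustrative assumptions, not values from the paper.
rmse <- function(obs, pred) sqrt(mean((obs - pred)^2))   # helper assumed by FitModel()
abse <- function(obs, pred) mean(abs(obs - pred))        # helper assumed by FitModel()
obsX <- wyniki$i                            # sample sizes used in the simulation
obsY <- wyniki$meanROC                      # mean cross-validated AUCs
obsW <- seq_along(obsX) / length(obsX)      # assumed weights favouring larger subsamples
fit  <- FitModel(offset = 1, X = obsX, Y = obsY, W = obsW,
                 i = length(obsX), N = length(obsX),
                 startParams = list(a = 0.1, b = 1, c = -0.5))
# Extrapolate the fitted curve to candidate sample sizes, e.g. 100-1000 patients
targetN <- seq(100, 1000, by = 50)
predAUC <- as.numeric(predict(fit$model, list(x = targetN)))
plot(targetN, predAUC, type = "b",
     xlab = "Number of samples", ylab = "Predicted mean AUC")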