# makes no sense to 'fold' a single residue protein, so start at length 2 # benchmarked on 16GB RAM machine, 10 residue string took 1.5s cf. 5mins on a 2GB RAM Mac benchmarktable <- matrix(c(1:15,NA,0.004,0.004,0.008,0.014,0.022,0.048,0.135,0.474,1.584,4.854,15.65,48.2,151,469),ncol=2) colnames(benchmarktable) <- c("SequenceLength","RunTime") library(reshape2) benchmarkdata = melt(benchmarktable, id="SequenceLength") colnames(benchmarkdata)[c(1,3)] <- c("SequenceLength","RunTime") # can now plot the data... library(ggplot2) p <- ggplot(benchmarkdata[c(16:30),c(1,3)], aes_string(x="SequenceLength", y="RunTime")) + geom_line() + geom_point() + scale_x_continuous(breaks=function(limits) pretty(limits,6)) # export as BenchmarkPlot.png # to see more easily, view log-scaled, using default Loess smoothing (only interested in straight region of graph anyway) p <- ggplot(benchmarkdata[c(16:30),c(1,3)], aes_string(x="SequenceLength", y="RunTime")) + geom_smooth(fill=NA) + geom_point() + scale_y_log10() + scale_x_continuous(breaks=function(limits) pretty(limits,20)) # export as BenchmarkLogPlot.png # finding line of best fit - making a linear model logbenchmarkdf <- data.frame(benchmarktable[c(9:15),]) logbenchmarkdf$LogRunTime <- log10(logbenchmarkdf$RunTime) x <- logbenchmarkdf$SequenceLength y <- logbenchmarkdf$LogRunTime fit.lm <-lm(y ~ x) slope <- coef(fit.lm)[2] intercept <- coef(fit.lm)[1] # can compare real results with predictions from the linear model: logbenchmarkdf$PredictedLogRunTime <- predict(fit.lm) logbenchmarkdf$PredictedRunTime <- 10^(logbenchmarkdf$PredictedLogRunTime) as.double(predict(lm(y ~ x), data.frame(x = 16, se.fit = TRUE)[1])) # => 3.175846 # i.e. predicted 10^(3.175) seconds for sequence length 16. For a sequence length 20: 10^(as.double(predict(lm(y ~ x), data.frame(x = 20, se.fit = TRUE)[1])))/60/60 # => 40.85042 # Around 40 hours, as Excel's trendline predicted :-) # To run predictions on the whole sequence of 16 to 20, just use the equation as a new data.frame column logbenchmarkpredictions <- data.frame(c(16:20),NA) colnames(logbenchmarkpredictions) <- c("SequenceLength","PredictedRunTimeHours") logbenchmarkpredictions$PredictedRunTimeHours <- 10^(as.double(predict(lm(y ~ x), data.frame(x = logbenchmarkpredictions$SequenceLength, se.fit = TRUE)[1])))/60/60