# makes no sense to 'fold' a single residue protein, so start at length 2
# benchmarked on 16GB RAM machine, 10 residue string took 1.5s cf. 5mins on a 2GB RAM Mac

benchmarktable <- matrix(c(1:15,NA,0.004,0.004,0.008,0.014,0.022,0.048,0.135,0.474,1.584,4.854,15.65,48.2,151,469),ncol=2)
colnames(benchmarktable) <- c("SequenceLength","RunTime")

library(reshape2)
benchmarkdata = melt(benchmarktable, id="SequenceLength")
colnames(benchmarkdata)[c(1,3)] <- c("SequenceLength","RunTime")

# can now plot the data...

library(ggplot2)

p <- ggplot(benchmarkdata[c(16:30),c(1,3)], aes_string(x="SequenceLength", y="RunTime")) + geom_line() + geom_point() + scale_x_continuous(breaks=function(limits) pretty(limits,6))
# export as BenchmarkPlot.png

# to see more easily, view log-scaled, using default Loess smoothing (only interested in straight region of graph anyway)

p <- ggplot(benchmarkdata[c(16:30),c(1,3)], aes_string(x="SequenceLength", y="RunTime")) + geom_smooth(fill=NA) + geom_point() + scale_y_log10() + scale_x_continuous(breaks=function(limits) pretty(limits,20))
# export as BenchmarkLogPlot.png

# finding line of best fit - making a linear model

logbenchmarkdf <- data.frame(benchmarktable[c(9:15),])
logbenchmarkdf$LogRunTime <- log10(logbenchmarkdf$RunTime)

x <- logbenchmarkdf$SequenceLength
y <- logbenchmarkdf$LogRunTime

fit.lm <-lm(y ~ x)
slope <- coef(fit.lm)[2]
intercept <- coef(fit.lm)[1]

# can compare real results with predictions from the linear model:

logbenchmarkdf$PredictedLogRunTime <- predict(fit.lm)
logbenchmarkdf$PredictedRunTime <- 10^(logbenchmarkdf$PredictedLogRunTime)

as.double(predict(lm(y ~ x), data.frame(x = 16, se.fit = TRUE)[1]))
# => 3.175846

# i.e. predicted 10^(3.175) seconds for sequence length 16. For a sequence length 20:

10^(as.double(predict(lm(y ~ x), data.frame(x = 20, se.fit = TRUE)[1])))/60/60
# => 40.85042

# Around 40 hours, as Excel's trendline predicted :-)

# To run predictions on the whole sequence of 16 to 20, just use the equation as a new data.frame column

logbenchmarkpredictions <- data.frame(c(16:20),NA)
colnames(logbenchmarkpredictions) <- c("SequenceLength","PredictedRunTimeHours")
logbenchmarkpredictions$PredictedRunTimeHours <- 10^(as.double(predict(lm(y ~ x), data.frame(x = logbenchmarkpredictions$SequenceLength, se.fit = TRUE)[1])))/60/60