Last active
August 29, 2015 14:10
-
-
Save lmmx/01c872cc050db52cb427 to your computer and use it in GitHub Desktop.
Using the Folding.py program provided at hpcuniversity.org/students/weeklyChallenge/57/ in the file at www.shodor.org/media/content//hpcu/resources/Folding.py - included below with the 20 residue string used in Lesk's Introduction to Protein Science, Figure 2.31
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# makes no sense to 'fold' a single residue protein, so start at length 2 | |
# benchmarked on 16GB RAM machine, 10 residue string took 1.5s cf. 5mins on a 2GB RAM Mac | |
benchmarktable <- matrix(c(1:15,NA,0.004,0.004,0.008,0.014,0.022,0.048,0.135,0.474,1.584,4.854,15.65,48.2,151,469),ncol=2) | |
colnames(benchmarktable) <- c("SequenceLength","RunTime") | |
library(reshape2) | |
benchmarkdata = melt(benchmarktable, id="SequenceLength") | |
colnames(benchmarkdata)[c(1,3)] <- c("SequenceLength","RunTime") | |
# can now plot the data... | |
library(ggplot2) | |
p <- ggplot(benchmarkdata[c(16:30),c(1,3)], aes_string(x="SequenceLength", y="RunTime")) + geom_line() + geom_point() + scale_x_continuous(breaks=function(limits) pretty(limits,6)) | |
# export as BenchmarkPlot.png | |
# to see more easily, view log-scaled, using default Loess smoothing (only interested in straight region of graph anyway) | |
p <- ggplot(benchmarkdata[c(16:30),c(1,3)], aes_string(x="SequenceLength", y="RunTime")) + geom_smooth(fill=NA) + geom_point() + scale_y_log10() + scale_x_continuous(breaks=function(limits) pretty(limits,20)) | |
# export as BenchmarkLogPlot.png | |
# finding line of best fit - making a linear model | |
logbenchmarkdf <- data.frame(benchmarktable[c(9:15),]) | |
logbenchmarkdf$LogRunTime <- log10(logbenchmarkdf$RunTime) | |
x <- logbenchmarkdf$SequenceLength | |
y <- logbenchmarkdf$LogRunTime | |
fit.lm <-lm(y ~ x) | |
slope <- coef(fit.lm)[2] | |
intercept <- coef(fit.lm)[1] | |
# can compare real results with predictions from the linear model: | |
logbenchmarkdf$PredictedLogRunTime <- predict(fit.lm) | |
logbenchmarkdf$PredictedRunTime <- 10^(logbenchmarkdf$PredictedLogRunTime) | |
as.double(predict(lm(y ~ x), data.frame(x = 16, se.fit = TRUE)[1])) | |
# => 3.175846 | |
# i.e. predicted 10^(3.175) seconds for sequence length 16. For a sequence length 20: | |
10^(as.double(predict(lm(y ~ x), data.frame(x = 20, se.fit = TRUE)[1])))/60/60 | |
# => 40.85042 | |
# Around 40 hours, as Excel's trendline predicted :-) | |
# To run predictions on the whole sequence of 16 to 20, just use the equation as a new data.frame column | |
logbenchmarkpredictions <- data.frame(c(16:20),NA) | |
colnames(logbenchmarkpredictions) <- c("SequenceLength","PredictedRunTimeHours") | |
logbenchmarkpredictions$PredictedRunTimeHours <- 10^(as.double(predict(lm(y ~ x), data.frame(x = logbenchmarkpredictions$SequenceLength, se.fit = TRUE)[1])))/60/60 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
##--------------Program Info:-----------------------------------------------## | |
# Folding but not your Laundry - Using the 2D HP model for showing how protein | |
# folding works | |
# | |
# File: folding.py | |
# | |
# By Mobeen Ludin and Aaron Weeden, 21 May 2013 | |
# | |
# High-level Algorithm: | |
# - Given a string | |
# - Generate each possible folding of the string | |
# - Count the number of H-H bonds for each folding of the string | |
# - Save the foldings with the greatest number of H-H bonds | |
# - Throw away the foldings we saved if we find a folding with a bigger | |
# number of H-H bonds | |
# - Print the foldings we saved when we are done | |
# | |
##------------------------------------------------------------------------------ | |
##----------Python Code for Protein Folding:----------------------------------- | |
# | |
import sys | |
#---Global variable initialization---# | |
EMPTY = ' ' # This represents an empty character in the grid | |
best_grids = [] # This is a list of the grids that contain the best foldings | |
#---Main function definition---# | |
def main(): | |
#--Local variable initialization--# | |
current_element_idx = 0 # Index of the current element in protein | |
protein = 'HPHPPHHPHPPHPHHPPHPH' # Protein we are folding | |
#protein = "HHPPHHHPHHPH" # Protein we are folding | |
current_grid = [] # Grid in which we are currently folding | |
current_num_H_bonds = max_num_H_bonds = 0 # Counters | |
print("Hello protein folders, beginning program") # Print a greeting | |
#-Fill the grid with empty characters-# | |
# For each row of the grid | |
for row in range(len(protein)*2-1): | |
# Create a new empty list for that row | |
current_grid.append([]) | |
# For each column of the grid | |
for col in range(len(protein)*2-1): | |
# Add an empty character at the given row/column | |
current_grid[row].append(EMPTY) | |
# Start the first protein element in the middle of the grid | |
current_row = current_col = int((len(protein)*2-1)/2) | |
# Recursively find the best foldings, filling best_grid | |
print "Best number of H-H bonds", fold( protein, max_num_H_bonds, current_element_idx, current_grid, current_row, current_col, ' ', current_num_H_bonds ) | |
#-Print the best grids-# | |
# For each of the best grids | |
for grid in range(len(best_grids)): | |
# Print the index of the grid | |
print "Grid ", grid, ":" | |
# For each row in the grid | |
for row in range(len(protein)*2-1): | |
# For each column in the grid | |
for col in range(len(protein)*2-1): | |
# Print the character at the given row and | |
# column | |
print best_grids[grid][row][col], | |
# Print a line in between the rows | |
print '' | |
#---Fold function definition---# | |
def fold( protein, max_num_H_bonds, current_element_idx, current_grid, current_row, current_col, direction, current_num_H_bonds ): | |
# Determine the new current row and column based on the current | |
# direction | |
if direction == 'R': | |
current_col +=1 | |
elif direction == 'D': | |
current_row +=1 | |
elif direction == 'L': | |
current_col -=1 | |
elif direction == 'U': | |
current_row -=1 | |
# If we are able to place an element at the current row and column | |
if current_grid[current_row][current_col] == EMPTY: | |
#-Make a copy of the current grid before we change it-# | |
new_grid = [] | |
for row in range(len(protein)*2-1): | |
new_grid.append([]) | |
for col in range(len(protein)*2-1): | |
new_grid[row].append(current_grid[row][col]) | |
# Place the protein in the new grid | |
new_grid[current_row][current_col] = protein[current_element_idx] | |
# Check for H-H bonds in the current fold | |
if protein[current_element_idx]=='H': | |
# Check to the left | |
if current_col > 0 and new_grid[current_row][current_col-1]=='H': | |
current_num_H_bonds +=1 | |
# Check above | |
if current_row > 0 and new_grid[current_row-1][current_col]=='H': | |
current_num_H_bonds +=1 | |
# Check to the right | |
if current_col < len(new_grid[current_row]) - 1 and new_grid[current_row][current_col+1]=='H': | |
current_num_H_bonds +=1 | |
# Check below | |
if current_row < len(new_grid) - 1 and new_grid[current_row+1][current_col]=='H': | |
current_num_H_bonds +=1 | |
# Move on to the next element index | |
current_element_idx +=1 | |
# If not end of string, choose each direction and recur | |
if current_element_idx != len(protein): | |
for direction in ['R', 'D', 'L', 'U']: | |
max_num_H_bonds = fold( protein, max_num_H_bonds, current_element_idx, new_grid, current_row, current_col, direction, current_num_H_bonds ) | |
else: | |
# If end of string, check if the current fold has more | |
# H-H bonds than the max we found before. | |
# if true update the max. If we have the same number of | |
# H-H bonds, append the current grid to the list of best | |
# grids | |
if current_num_H_bonds > max_num_H_bonds: | |
max_num_H_bonds = current_num_H_bonds | |
del best_grids[:] | |
best_grids.append(new_grid) | |
elif current_num_H_bonds == max_num_H_bonds: | |
best_grids.append(new_grid) | |
# Return the count of the maximum number of H-H bonds | |
return max_num_H_bonds | |
##---Call main function-----## | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment