Created
November 6, 2014 15:39
-
-
Save rohanp/df8a5f44826ca2ff1ab7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
The code was modified so that you dont have to import any libraries or use external files :) | |
""" | |
import numpy as np | |
import sys | |
#import Bio.PDB | |
#from scipy import linalg | |
from time import clock | |
######################################################### | |
# Cython Stuff # | |
######################################################### | |
cimport numpy as np | |
cimport cython | |
from libc.math cimport sqrt | |
from libc.math cimport M_E | |
DTYPE = np.float64 | |
ctypedef np.float64_t DTYPE_t | |
######################################################### | |
# Global Variables # | |
######################################################### | |
cdef float cutoff= .03 | |
cdef np.ndarray derivativeEpsilonList= np.zeros(9, dtype=DTYPE) | |
cdef np.ndarray epsilonList= np.zeros(3, dtype=DTYPE) | |
cdef np.ndarray noiseFailure= np.empty(0, dtype=DTYPE) | |
######################################################### | |
# Functions # | |
######################################################### | |
@cython.boundscheck(False) | |
cdef calcSimilarityMatrix(N, atomMode): | |
cdef np.ndarray RMSD=np.zeros((N,N), dtype=DTYPE) | |
for i in range(N): | |
for j in range(i): | |
RMSD[i][j]=leastRMSD(i, j, atomMode) | |
RMSD[j][i]=RMSD[i][j] | |
return RMSD | |
@cython.boundscheck(False) | |
cdef DTYPE_t leastRMSD(i, j, atomMode): | |
''''''''''''''''''''''''''''''''''''''''''''''''''''''''' | |
Calculates least RMSD between two structures using | |
BioPython | |
''''''''''''''''''''''''''''''''''''''''''''''''''''''''' | |
##biopython omitted | |
return np.random.random()*3 | |
@cython.boundscheck(False) | |
cdef DTYPE_t createEpsilonLists(np.ndarray[DTYPE_t, ndim=2] RMSD): | |
cdef DTYPE_t maxEpsilon=np.amax(RMSD) | |
print maxEpsilon | |
cdef DTYPE_t a=(3./7.)*maxEpsilon | |
cdef DTYPE_t b= (1./2.)*maxEpsilon | |
cdef DTYPE_t c=(4./7.)*maxEpsilon | |
epsilonList[0]= a | |
epsilonList[1]= b | |
epsilonList[2]= c | |
print(epsilonList) | |
for i,v in enumerate(epsilonList): | |
derivativeEpsilonList[i*3]= v-.001 | |
derivativeEpsilonList[i*3+1]= v | |
derivativeEpsilonList[i*3+2]= v+.001 | |
return maxEpsilon | |
@cython.boundscheck(False) | |
cdef np.ndarray calcEpsilonArray(np.ndarray[DTYPE_t, ndim=2] RMSD): | |
''''''''''''''''''''''''''''''''''''''''''''''''''''''''' | |
1. Uses RMSD to get length of the available epsilons | |
2. Creates epsilon list and calculates optimal epsilon | |
value using getEpsilon() | |
''''''''''''''''''''''''''''''''''''''''''''''''''''''''' | |
cdef unsigned int N=RMSD.shape[0] | |
cdef np.ndarray epsilonArray=np.zeros(N, dtype=DTYPE) | |
cdef unsigned int xi | |
for xi in range(N): | |
epsilonArray[xi]=getEpsilon(xi, RMSD, derivativeEpsilonList, N) | |
return epsilonArray | |
@cython.boundscheck(False) | |
cdef DTYPE_t getEpsilon(unsigned int xi, np.ndarray[DTYPE_t, ndim=2] RMSD, np.ndarray[DTYPE_t, ndim=1] derivativeEpsilonList, unsigned int N): | |
''''''''''''''''''''''''''''''''''''''''''''''''''''''''' | |
1. Calculates sorted eigenvalues for three different | |
epsilons using SVD method | |
2. Inputs eigenvalue matrix into getIntrinsicScaleAndDim() | |
3. Returns optimal epsilon value | |
''''''''''''''''''''''''''''''''''''''''''''''''''''''''' | |
cdef np.ndarray eigenvalues=np.zeros((len(derivativeEpsilonList),N), dtype= DTYPE) | |
cdef np.ndarray neighbors = np.empty(0, dtype=np.int) | |
cdef np.ndarray neighborsMatrix | |
cdef np.ndarray A | |
cdef unsigned int i, j, x, y | |
cdef DTYPE_t e, k | |
cdef DTYPE_t eps | |
for i,e in enumerate(derivativeEpsilonList): | |
neighbors = np.zeros(N) | |
#find all models which are within RMSD dist e of model xi | |
for j in xrange(N): | |
if(RMSD[xi,j]<=e+.01): | |
np.insert(neighbors, 0, j) | |
#print(neighbors) | |
if len(neighbors)>1: | |
neighborsLength=len(neighbors) | |
neighborsMatrix=np.zeros((neighborsLength,neighborsLength)) | |
#creates matrix of neighbors | |
for x in range(neighborsLength): | |
for y in range(neighborsLength): | |
neighborsMatrix[x][y]=RMSD[x,neighbors[y]] | |
A = np.linalg.svd(neighborsMatrix, compute_uv=False) | |
for j,k in enumerate(A): | |
eigenvalues[i][j]=k*k | |
else: | |
print "maxEpsilon too small! Try again with a larger maxEpsilon value." | |
#epsilonFailure.append(str(xi)+str(e)) | |
return derivativeEpsilonList[i+3] | |
#quit() | |
eps= getIntrinsicScaleAndDim(xi,eigenvalues) | |
return eps | |
@cython.boundscheck(False) | |
cdef DTYPE_t getIntrinsicScaleAndDim(xi, np.ndarray[DTYPE_t, ndim=2] eigenvalues): | |
cdef np.ndarray statusVectors=np.zeros((len(epsilonList), eigenvalues.shape[1] ), dtype = DTYPE) | |
cdef np.ndarray dStatusVectors=np.zeros((len(epsilonList), eigenvalues.shape[1]), dtype = DTYPE) | |
cdef np.ndarray derivativeArray=np.zeros((len(epsilonList),len(eigenvalues[0])), dtype=DTYPE) | |
cdef unsigned int cols= eigenvalues.shape[1] | |
cdef unsigned int i, j, k | |
cdef int maxStart | |
#calculating the status vectors for the three epsilon values. | |
#the eigenvalues matrix has 9 rows with one value slightly above and one slightly below each of 3 epsilons to calculate the derivative. | |
#the status vector matrix only has 3 rows, one for each epsilon | |
#b*3+1 scales the 0-2 range to get the indicies 1, 4, 7 to get the eigenvalues corresponding to the correct epsilon | |
#each element N in the status vector for an epsilon e is calculated by subtrating the Nth eigenvalue for that epsilon from the N+1th eigenvalue | |
for i in xrange(epsilonList.shape[0]): | |
for j in range(cols-1): | |
statusVectors[i][j]=eigenvalues[i*3+1][j+1]-eigenvalues[i*3+1][j] | |
#converting the status vector to discrete 1's and 0's | |
for i in xrange(epsilonList.shape[0]): | |
for j in xrange(cols-1): | |
if statusVectors[i][j]>2*statusVectors[i][j+1] and statusVectors[i][j]>2*statusVectors[i][j+2] and statusVectors[i][j]>2*statusVectors[i][j+3] and statusVectors[i][j]>2*statusVectors[i][j+4] and statusVectors[i][j]>2*statusVectors[i][j+5]: | |
dStatusVectors[i][j]=1 | |
else: | |
dStatusVectors[i][j]=0 | |
start=np.zeros(len(epsilonList),dtype=np.int) | |
#figure out seperation between noise and non-noise eigenvalues | |
for i in xrange(epsilonList.shape[0]): | |
for j in xrange(len(dStatusVectors[0])-2): | |
if(dStatusVectors[i][j]==1 and dStatusVectors[i][j+1]==1 and dStatusVectors[i][j+2]==0 and dStatusVectors[i][j+3]==0 and dStatusVectors[i][j+4]==0): | |
start[i]=(j+2) | |
#print "start row %s = %s"%(i, start) | |
break | |
if start[i]==0: | |
#noiseFailure.append("atom "+str(xi)+" epsilon "+str(epsilonList[i])) | |
return epsilonList[0] | |
maxStart=max(start) | |
#calc derivative array | |
for i in range(maxStart, len(eigenvalues[0])): | |
for k in range(0,len(eigenvalues)/3): | |
slicedEigenValueArray=eigenvalues[:,i] | |
slicedEigenValueArray=slicedEigenValueArray[k*3:k*3+3] | |
slicedEpsilonList=derivativeEpsilonList[k*3:k*3+3] | |
derivativeArray[k][i-maxStart]=calculateDerivative(slicedEigenValueArray,slicedEpsilonList) | |
#determine optimal epsilon value | |
for k in range(derivativeArray.shape[1]): | |
for i in range(derivativeArray.shape[0]): | |
convergence=True | |
for j in range(k,derivativeArray.shape[1]): | |
if derivativeArray[i][j]==0: | |
convergence=False | |
if convergence==True: | |
return epsilonList[i] | |
#print "ERROR" | |
#derivativeFailure.append("atom"+str(xi)) | |
#derivativeFailure.append("atom"+str(xi)) | |
@cython.boundscheck(False) | |
cdef np.ndarray calcTransitionMatrix(np.ndarray[DTYPE_t, ndim=2] RMSD, np.ndarray[DTYPE_t, ndim=1] epsilonArray): | |
''''''''''''''''''''''''''''''''''''''''''''''''''''''''' | |
Constructs a Markov matrix P with three steps | |
1. Uses RMSD matrix and epsilon values to calculate kernal | |
matrix, K | |
2. Compute D using K matrix (summation), compute Ktilda | |
by using the formula Ktilda/sqrt(D*D), create Dtilda | |
matrix by computing summation | |
3. P = Ktilda/Dtilda | |
''''''''''''''''''''''''''''''''''''''''''''''''''''''''' | |
#getting data from input file | |
cdef int N = len(RMSD) | |
cdef np.ndarray K=np.zeros((N,N), dtype=DTYPE) | |
cdef np.ndarray D=np.zeros(N, dtype=DTYPE) | |
#making sure matricies are the right size | |
#calculating K and D | |
for i in range(N): | |
for j in range(N): | |
K[i][j] = M_E**((-RMSD[i][j]**2)/(2*epsilonArray[i]*epsilonArray[j])) | |
D[i] += K[i][j] | |
#creating Ktilda matrix, Dtilda array, and outfile | |
cdef np.ndarray Ktilda=np.zeros((N,N), dtype=DTYPE) | |
cdef np.ndarray Dtilda=np.zeros(N, dtype=DTYPE) | |
#calculating Ktilda and Dtilda | |
for i in xrange(N): | |
for j in xrange(N): | |
Ktilda[i][j]=K[i][j]/sqrt(D[i]*D[j]) | |
Dtilda[i]+=Ktilda[i][j] | |
#creating P matrix and writing P to outfile | |
cdef np.ndarray P=np.zeros((N,N), dtype=DTYPE) | |
for i in xrange(N): | |
for j in xrange(N): | |
P[i][j]=Ktilda[i][j]/Dtilda[i] | |
return P | |
######################################################### | |
# Mathematical Functions # | |
######################################################### | |
@cython.boundscheck(False) | |
cdef unsigned int calculateDerivative(np.ndarray[DTYPE_t, ndim=1] eigenValueArray, np.ndarray[DTYPE_t, ndim=1] epsilonArray): | |
derivative=(eigenValueArray[2]-eigenValueArray[0])/(epsilonArray[2]-epsilonArray[0]) | |
if derivative<cutoff: | |
return 1 | |
else: | |
return 0 | |
@cython.boundscheck(False) | |
cdef np.ndarray getProj(np.ndarray[DTYPE_t, ndim=2] P, np.ndarray[DTYPE_t, ndim=2] e_vecs): | |
cdef unsigned int i,j | |
projMatrix=np.zeros((P.shape[1],e_vecs.shape[0])) | |
for i in range(projMatrix.shape[0]): | |
for j in range(projMatrix.shape[1]): | |
projMatrix[i][j]=dotProd(P[i],e_vecs[j]) | |
return projMatrix | |
@cython.boundscheck(False) | |
cdef np.ndarray dotProd(np.ndarray[DTYPE_t, ndim=2] Pi, np.ndarray[DTYPE_t, ndim=2] EVi): | |
EVi=np.transpose(EVi) | |
return np.dot(Pi, EVi) | |
@cython.boundscheck(False) | |
cdef np.ndarray calculateAccumulatedVariance(np.ndarray[DTYPE_t, ndim=1] eigenvalues): | |
#accumulate variances | |
accVars = eigenvalues.copy() | |
nevals = accVars.shape[0] | |
for i in range(1, nevals): | |
previous = accVars[i-1] | |
accVars[i] += previous | |
for i in range(0, nevals): | |
accVars[i] /= accVars[nevals-1] | |
accVars[i] *= 100.0 | |
return accVars | |
@cython.boundscheck(False) | |
cdef np.ndarray removeNegatives(np.ndarray[DTYPE_t, ndim=1] evals_sorted): | |
evals_sorted_positive=np.zeros(len(evals_sorted)) | |
for i,v in enumerate(evals_sorted): | |
if v>0: | |
evals_sorted_positive[i]=v | |
return evals_sorted_positive | |
@cython.boundscheck(False) | |
cdef eigDecomposeTransMatrix(np.ndarray[DTYPE_t, ndim=2] P, int order): | |
''''''''''''''''''''''''''''''''''''''''''''''''''''''''' | |
Calculates eigenvalues and eigenvectors, and sorts them | |
Order=0 for ascending, Order = 1 for descending | |
''''''''''''''''''''''''''''''''''''''''''''''''''''''''' | |
cdef np.ndarray eigenvalues,eigenvectors | |
eigenvalues, eigenvectors= np.linalg.eig(P) | |
cdef np.ndarray evals_sorted = np.zeros(eigenvalues.size, dtype=DTYPE) | |
cdef np.ndarray evecs_sorted = np.zeros((eigenvectors.shape[0], eigenvectors.shape[1]), dtype=DTYPE) | |
if order == 0: | |
sorted_evals_idx = eigenvalues.argsort() | |
evals_sorted = eigenvalues[sorted_evals_idx] | |
evecs_sorted = eigenvectors[sorted_evals_idx] | |
if order == 1: | |
sorted_evals_idx = eigenvalues.argsort()[::-1] | |
evals_sorted = eigenvalues[sorted_evals_idx] | |
evecs_sorted = eigenvectors[sorted_evals_idx] | |
return [evals_sorted,evecs_sorted] | |
######################################################### | |
# Main # | |
######################################################### | |
def main(inFileName, atomMode, cutoff): | |
startTime=clock() | |
cdef np.ndarray RMSD=calcSimilarityMatrix(50, atomMode) | |
maxEpsilon=createEpsilonLists(RMSD) | |
print "calculating epsilon array" | |
cdef np.ndarray epsilonArray=calcEpsilonArray(RMSD) | |
print "calculating transition matrix" | |
cdef np.ndarray P=calcTransitionMatrix(RMSD, epsilonArray) | |
print "eigen decomposition" | |
l=eigDecomposeTransMatrix(P,1) | |
cdef np.ndarray evals_sorted, evecs_sorted | |
evals_sorted=l[0] | |
evecs_sorted=l[1] | |
cdef np.ndarray evals_sorted_positive =removeNegatives(evals_sorted) | |
print "calculating accumulated variance" | |
cdef np.ndarray accumVar= calculateAccumulatedVariance(evals_sorted_positive) | |
endTime=clock() | |
elapsedTime= endTime - startTime | |
print "Elapsed time: %s seconds"%(elapsedTime) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment