Skip to content

Instantly share code, notes, and snippets.

@rohanp
Created November 6, 2014 15:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rohanp/df8a5f44826ca2ff1ab7 to your computer and use it in GitHub Desktop.
Save rohanp/df8a5f44826ca2ff1ab7 to your computer and use it in GitHub Desktop.
"""
The code was modified so that you dont have to import any libraries or use external files :)
"""
import numpy as np
import sys
#import Bio.PDB
#from scipy import linalg
from time import clock
#########################################################
# Cython Stuff #
#########################################################
cimport numpy as np
cimport cython
from libc.math cimport sqrt
from libc.math cimport M_E
DTYPE = np.float64
ctypedef np.float64_t DTYPE_t
#########################################################
# Global Variables #
#########################################################
cdef float cutoff= .03
cdef np.ndarray derivativeEpsilonList= np.zeros(9, dtype=DTYPE)
cdef np.ndarray epsilonList= np.zeros(3, dtype=DTYPE)
cdef np.ndarray noiseFailure= np.empty(0, dtype=DTYPE)
#########################################################
# Functions #
#########################################################
@cython.boundscheck(False)
cdef calcSimilarityMatrix(N, atomMode):
cdef np.ndarray RMSD=np.zeros((N,N), dtype=DTYPE)
for i in range(N):
for j in range(i):
RMSD[i][j]=leastRMSD(i, j, atomMode)
RMSD[j][i]=RMSD[i][j]
return RMSD
@cython.boundscheck(False)
cdef DTYPE_t leastRMSD(i, j, atomMode):
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''
Calculates least RMSD between two structures using
BioPython
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''
##biopython omitted
return np.random.random()*3
@cython.boundscheck(False)
cdef DTYPE_t createEpsilonLists(np.ndarray[DTYPE_t, ndim=2] RMSD):
cdef DTYPE_t maxEpsilon=np.amax(RMSD)
print maxEpsilon
cdef DTYPE_t a=(3./7.)*maxEpsilon
cdef DTYPE_t b= (1./2.)*maxEpsilon
cdef DTYPE_t c=(4./7.)*maxEpsilon
epsilonList[0]= a
epsilonList[1]= b
epsilonList[2]= c
print(epsilonList)
for i,v in enumerate(epsilonList):
derivativeEpsilonList[i*3]= v-.001
derivativeEpsilonList[i*3+1]= v
derivativeEpsilonList[i*3+2]= v+.001
return maxEpsilon
@cython.boundscheck(False)
cdef np.ndarray calcEpsilonArray(np.ndarray[DTYPE_t, ndim=2] RMSD):
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''
1. Uses RMSD to get length of the available epsilons
2. Creates epsilon list and calculates optimal epsilon
value using getEpsilon()
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''
cdef unsigned int N=RMSD.shape[0]
cdef np.ndarray epsilonArray=np.zeros(N, dtype=DTYPE)
cdef unsigned int xi
for xi in range(N):
epsilonArray[xi]=getEpsilon(xi, RMSD, derivativeEpsilonList, N)
return epsilonArray
@cython.boundscheck(False)
cdef DTYPE_t getEpsilon(unsigned int xi, np.ndarray[DTYPE_t, ndim=2] RMSD, np.ndarray[DTYPE_t, ndim=1] derivativeEpsilonList, unsigned int N):
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''
1. Calculates sorted eigenvalues for three different
epsilons using SVD method
2. Inputs eigenvalue matrix into getIntrinsicScaleAndDim()
3. Returns optimal epsilon value
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''
cdef np.ndarray eigenvalues=np.zeros((len(derivativeEpsilonList),N), dtype= DTYPE)
cdef np.ndarray neighbors = np.empty(0, dtype=np.int)
cdef np.ndarray neighborsMatrix
cdef np.ndarray A
cdef unsigned int i, j, x, y
cdef DTYPE_t e, k
cdef DTYPE_t eps
for i,e in enumerate(derivativeEpsilonList):
neighbors = np.zeros(N)
#find all models which are within RMSD dist e of model xi
for j in xrange(N):
if(RMSD[xi,j]<=e+.01):
np.insert(neighbors, 0, j)
#print(neighbors)
if len(neighbors)>1:
neighborsLength=len(neighbors)
neighborsMatrix=np.zeros((neighborsLength,neighborsLength))
#creates matrix of neighbors
for x in range(neighborsLength):
for y in range(neighborsLength):
neighborsMatrix[x][y]=RMSD[x,neighbors[y]]
A = np.linalg.svd(neighborsMatrix, compute_uv=False)
for j,k in enumerate(A):
eigenvalues[i][j]=k*k
else:
print "maxEpsilon too small! Try again with a larger maxEpsilon value."
#epsilonFailure.append(str(xi)+str(e))
return derivativeEpsilonList[i+3]
#quit()
eps= getIntrinsicScaleAndDim(xi,eigenvalues)
return eps
@cython.boundscheck(False)
cdef DTYPE_t getIntrinsicScaleAndDim(xi, np.ndarray[DTYPE_t, ndim=2] eigenvalues):
cdef np.ndarray statusVectors=np.zeros((len(epsilonList), eigenvalues.shape[1] ), dtype = DTYPE)
cdef np.ndarray dStatusVectors=np.zeros((len(epsilonList), eigenvalues.shape[1]), dtype = DTYPE)
cdef np.ndarray derivativeArray=np.zeros((len(epsilonList),len(eigenvalues[0])), dtype=DTYPE)
cdef unsigned int cols= eigenvalues.shape[1]
cdef unsigned int i, j, k
cdef int maxStart
#calculating the status vectors for the three epsilon values.
#the eigenvalues matrix has 9 rows with one value slightly above and one slightly below each of 3 epsilons to calculate the derivative.
#the status vector matrix only has 3 rows, one for each epsilon
#b*3+1 scales the 0-2 range to get the indicies 1, 4, 7 to get the eigenvalues corresponding to the correct epsilon
#each element N in the status vector for an epsilon e is calculated by subtrating the Nth eigenvalue for that epsilon from the N+1th eigenvalue
for i in xrange(epsilonList.shape[0]):
for j in range(cols-1):
statusVectors[i][j]=eigenvalues[i*3+1][j+1]-eigenvalues[i*3+1][j]
#converting the status vector to discrete 1's and 0's
for i in xrange(epsilonList.shape[0]):
for j in xrange(cols-1):
if statusVectors[i][j]>2*statusVectors[i][j+1] and statusVectors[i][j]>2*statusVectors[i][j+2] and statusVectors[i][j]>2*statusVectors[i][j+3] and statusVectors[i][j]>2*statusVectors[i][j+4] and statusVectors[i][j]>2*statusVectors[i][j+5]:
dStatusVectors[i][j]=1
else:
dStatusVectors[i][j]=0
start=np.zeros(len(epsilonList),dtype=np.int)
#figure out seperation between noise and non-noise eigenvalues
for i in xrange(epsilonList.shape[0]):
for j in xrange(len(dStatusVectors[0])-2):
if(dStatusVectors[i][j]==1 and dStatusVectors[i][j+1]==1 and dStatusVectors[i][j+2]==0 and dStatusVectors[i][j+3]==0 and dStatusVectors[i][j+4]==0):
start[i]=(j+2)
#print "start row %s = %s"%(i, start)
break
if start[i]==0:
#noiseFailure.append("atom "+str(xi)+" epsilon "+str(epsilonList[i]))
return epsilonList[0]
maxStart=max(start)
#calc derivative array
for i in range(maxStart, len(eigenvalues[0])):
for k in range(0,len(eigenvalues)/3):
slicedEigenValueArray=eigenvalues[:,i]
slicedEigenValueArray=slicedEigenValueArray[k*3:k*3+3]
slicedEpsilonList=derivativeEpsilonList[k*3:k*3+3]
derivativeArray[k][i-maxStart]=calculateDerivative(slicedEigenValueArray,slicedEpsilonList)
#determine optimal epsilon value
for k in range(derivativeArray.shape[1]):
for i in range(derivativeArray.shape[0]):
convergence=True
for j in range(k,derivativeArray.shape[1]):
if derivativeArray[i][j]==0:
convergence=False
if convergence==True:
return epsilonList[i]
#print "ERROR"
#derivativeFailure.append("atom"+str(xi))
#derivativeFailure.append("atom"+str(xi))
@cython.boundscheck(False)
cdef np.ndarray calcTransitionMatrix(np.ndarray[DTYPE_t, ndim=2] RMSD, np.ndarray[DTYPE_t, ndim=1] epsilonArray):
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''
Constructs a Markov matrix P with three steps
1. Uses RMSD matrix and epsilon values to calculate kernal
matrix, K
2. Compute D using K matrix (summation), compute Ktilda
by using the formula Ktilda/sqrt(D*D), create Dtilda
matrix by computing summation
3. P = Ktilda/Dtilda
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''
#getting data from input file
cdef int N = len(RMSD)
cdef np.ndarray K=np.zeros((N,N), dtype=DTYPE)
cdef np.ndarray D=np.zeros(N, dtype=DTYPE)
#making sure matricies are the right size
#calculating K and D
for i in range(N):
for j in range(N):
K[i][j] = M_E**((-RMSD[i][j]**2)/(2*epsilonArray[i]*epsilonArray[j]))
D[i] += K[i][j]
#creating Ktilda matrix, Dtilda array, and outfile
cdef np.ndarray Ktilda=np.zeros((N,N), dtype=DTYPE)
cdef np.ndarray Dtilda=np.zeros(N, dtype=DTYPE)
#calculating Ktilda and Dtilda
for i in xrange(N):
for j in xrange(N):
Ktilda[i][j]=K[i][j]/sqrt(D[i]*D[j])
Dtilda[i]+=Ktilda[i][j]
#creating P matrix and writing P to outfile
cdef np.ndarray P=np.zeros((N,N), dtype=DTYPE)
for i in xrange(N):
for j in xrange(N):
P[i][j]=Ktilda[i][j]/Dtilda[i]
return P
#########################################################
# Mathematical Functions #
#########################################################
@cython.boundscheck(False)
cdef unsigned int calculateDerivative(np.ndarray[DTYPE_t, ndim=1] eigenValueArray, np.ndarray[DTYPE_t, ndim=1] epsilonArray):
derivative=(eigenValueArray[2]-eigenValueArray[0])/(epsilonArray[2]-epsilonArray[0])
if derivative<cutoff:
return 1
else:
return 0
@cython.boundscheck(False)
cdef np.ndarray getProj(np.ndarray[DTYPE_t, ndim=2] P, np.ndarray[DTYPE_t, ndim=2] e_vecs):
cdef unsigned int i,j
projMatrix=np.zeros((P.shape[1],e_vecs.shape[0]))
for i in range(projMatrix.shape[0]):
for j in range(projMatrix.shape[1]):
projMatrix[i][j]=dotProd(P[i],e_vecs[j])
return projMatrix
@cython.boundscheck(False)
cdef np.ndarray dotProd(np.ndarray[DTYPE_t, ndim=2] Pi, np.ndarray[DTYPE_t, ndim=2] EVi):
EVi=np.transpose(EVi)
return np.dot(Pi, EVi)
@cython.boundscheck(False)
cdef np.ndarray calculateAccumulatedVariance(np.ndarray[DTYPE_t, ndim=1] eigenvalues):
#accumulate variances
accVars = eigenvalues.copy()
nevals = accVars.shape[0]
for i in range(1, nevals):
previous = accVars[i-1]
accVars[i] += previous
for i in range(0, nevals):
accVars[i] /= accVars[nevals-1]
accVars[i] *= 100.0
return accVars
@cython.boundscheck(False)
cdef np.ndarray removeNegatives(np.ndarray[DTYPE_t, ndim=1] evals_sorted):
evals_sorted_positive=np.zeros(len(evals_sorted))
for i,v in enumerate(evals_sorted):
if v>0:
evals_sorted_positive[i]=v
return evals_sorted_positive
@cython.boundscheck(False)
cdef eigDecomposeTransMatrix(np.ndarray[DTYPE_t, ndim=2] P, int order):
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''
Calculates eigenvalues and eigenvectors, and sorts them
Order=0 for ascending, Order = 1 for descending
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''
cdef np.ndarray eigenvalues,eigenvectors
eigenvalues, eigenvectors= np.linalg.eig(P)
cdef np.ndarray evals_sorted = np.zeros(eigenvalues.size, dtype=DTYPE)
cdef np.ndarray evecs_sorted = np.zeros((eigenvectors.shape[0], eigenvectors.shape[1]), dtype=DTYPE)
if order == 0:
sorted_evals_idx = eigenvalues.argsort()
evals_sorted = eigenvalues[sorted_evals_idx]
evecs_sorted = eigenvectors[sorted_evals_idx]
if order == 1:
sorted_evals_idx = eigenvalues.argsort()[::-1]
evals_sorted = eigenvalues[sorted_evals_idx]
evecs_sorted = eigenvectors[sorted_evals_idx]
return [evals_sorted,evecs_sorted]
#########################################################
# Main #
#########################################################
def main(inFileName, atomMode, cutoff):
startTime=clock()
cdef np.ndarray RMSD=calcSimilarityMatrix(50, atomMode)
maxEpsilon=createEpsilonLists(RMSD)
print "calculating epsilon array"
cdef np.ndarray epsilonArray=calcEpsilonArray(RMSD)
print "calculating transition matrix"
cdef np.ndarray P=calcTransitionMatrix(RMSD, epsilonArray)
print "eigen decomposition"
l=eigDecomposeTransMatrix(P,1)
cdef np.ndarray evals_sorted, evecs_sorted
evals_sorted=l[0]
evecs_sorted=l[1]
cdef np.ndarray evals_sorted_positive =removeNegatives(evals_sorted)
print "calculating accumulated variance"
cdef np.ndarray accumVar= calculateAccumulatedVariance(evals_sorted_positive)
endTime=clock()
elapsedTime= endTime - startTime
print "Elapsed time: %s seconds"%(elapsedTime)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment