Skip to content

Instantly share code, notes, and snippets.

@mofas
Last active February 8, 2019 01:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mofas/3fe8b13fee1be159dd4d6b4113f5f87e to your computer and use it in GitHub Desktop.
Save mofas/3fe8b13fee1be159dd4d6b4113f5f87e to your computer and use it in GitHub Desktop.
For Pan
import sys
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
def minidistance(S1, S2) :
N = len(S1)
M = len(S2)
matrix = np.zeros((N + 1, M + 1), dtype=int) #Build an alignment matrix
matrix[0][0] = 0
for i in range(1, N+1):
matrix[i][0] = matrix[0][0] + i # left sequence, S1
for j in range(1, M+1):
matrix[0][j] = matrix[0][0] + j # top sequence, S2
for i in range (1, N+1):
for j in range (1, M+1):
c = 0
if S1[i-1] != S2[j-1]:
c = 1
matrix[i][j] = min(matrix[i-1][j]+1, matrix[i][j-1]+1, matrix[i-1][j-1]+c)
# print(matrix)
return matrix[-1][-1]
def read_data(filename):
file = open(filename, "r")
seq_list = []
key = ''
currentSeq = ''
for line in file:
if line[0] == '>':
if currentSeq:
seq_list.append((key, currentSeq))
currentSeq = ''
key = line.rstrip()
else:
currentSeq += line.rstrip()
# put the last seq and key into seq_list
seq_list.append((key, currentSeq))
return seq_list
def get_pairwise_dist_matrix(data_set):
data_len = len(data_set)
ret = np.zeros(data_len*data_len).reshape(data_len, data_len)
for i in range(data_len-1):
print(i)
for j in range(i+1, data_len):
(key_1, seq_1) = data_set[i]
(key_2, seq_2) = data_set[j]
dist = minidistance(seq_1, seq_2)
ret[i][j] = dist
ret[j][i] = dist
return ret
def main(input_file_name):
data = read_data(input_file_name)
# calulate distance matrix
dist_matrix = get_pairwise_dist_matrix(data)
Z = linkage(dist_matrix)
# get labels from data
labels = [d[0] for d in data]
plt.figure()
dendrogram(
Z,
leaf_rotation=90., # rotates the x axis labels
leaf_font_size=8., # font size for the x axis labels
labels=labels,
)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sequence')
plt.ylabel('distance')
plt.savefig('cluster.png')
# TODO
# Put your PCA code here and plot
# plt.savefig('pca.png')
main(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment