Skip to content

Instantly share code, notes, and snippets.

@rmitsch
Created November 13, 2017 20:41
Show Gist options
  • Save rmitsch/7bb1364a4c6d637cfcb5d2b4a055cf75 to your computer and use it in GitHub Desktop.
import sys
import numpy as np
import matplotlib.pyplot as plt
from numpy.linalg import eigh
import math
def read_data(file):
    """Read a ';'-delimited CSV file into a list of 1-D numpy vectors.

    Each line is split on ';' and the final field is discarded (the files
    end every record with a trailing ';', which yields an empty last field).
    Remaining fields are converted to floats.

    :param file: path of the CSV file to read.
    :return: list of np.ndarray, one vector per line.
    """
    data = []
    # Context manager guarantees the handle is closed even if a line fails
    # to parse (the original left the file open on error).
    with open(file, 'r') as f:
        for line in f:
            fields = line.split(';')[:-1]
            # np.array(..., dtype=float) performs the per-field str->float
            # conversion the original did element by element.
            data.append(np.array(fields, dtype=float))
    return data
def eps_range(q, D, epsilon):
    """Return the points of D lying in the epsilon-ball around q.

    (b)

    :param q: query point (1-D array-like).
    :param D: dataset, a sequence of points.
    :param epsilon: neighbourhood radius (Euclidean).
    :return: list of np.ndarray copies of the qualifying points.
    """
    return [np.array(point)
            for point in D
            if np.linalg.norm(q - point) <= epsilon]
def derive_local_correlations(D, epsilon, kappa, delta):
    """Compute each point's correlation similarity matrix.

    (c). See definition #10 and introduction to 3.3: "...which is determined
    by PCA of the points in the epsilon-neighborhood of P." - eigenvalues and
    eigenvectors for PCA are calculated on the point's corresponding
    epsilon-neighbourhood.

    :param D: dataset.
    :param epsilon: neighbourhood radius for the local PCA.
    :param kappa: weight assigned to small (thresholded) eigenvalues.
    :param delta: threshold on the max-normalised eigenvalues.
    :return: list of correlation similarity matrices, one per point of D.
    """
    matrices = []
    for point in D:
        # PCA of the point's eps-neighbourhood: eigendecomposition of the
        # neighbourhood's covariance (rowvar=False: rows are observations,
        # columns are variables).
        values, vectors = eigh(np.cov(eps_range(point, D, epsilon), rowvar=False))
        # Invert normalized eigenvalues. Eigenvalues are divided by their
        # maximum first, which is conveniently not described in detail in the
        # actual definition: large directions -> 1, small ones -> kappa.
        adapted = np.where(values / np.max(values) > delta, 1, kappa)
        matrices.append(vectors @ np.diag(adapted) @ vectors.T)
    return matrices
def correlation_dist(x, S1, y, S2):
    """Symmetric correlation distance between two points (def. #10).

    (d) Assuming max() is the operation for combining distances and the
    correlation similarity matrix equals the "local distance matrix".

    :param x: first point.
    :param S1: correlation similarity matrix of x.
    :param y: second point.
    :param S2: correlation similarity matrix of y.
    :return: the larger of the two directed correlation similarities.
    """
    dist_xy = calculate_correlation_similarity(x, y, S1)
    dist_yx = calculate_correlation_similarity(y, x, S2)
    return dist_xy if dist_xy >= dist_yx else dist_yx
def calculate_correlation_similarity(x, y, correlation_similarity_matrix_for_x):
    """Directed correlation similarity used for the correlation distance (def. #10).

    Computes sqrt((x - y) S (x - y)^T) with S the similarity matrix of x.

    :param x: first point.
    :param y: second point.
    :param correlation_similarity_matrix_for_x: local similarity matrix of x.
    :return: non-negative scalar similarity value.
    """
    difference = np.subtract(x, y)
    # Quadratic form (x - y) S (x - y)^T; transpose is a no-op for 1-D arrays.
    quadratic_form = difference.dot(correlation_similarity_matrix_for_x).dot(difference)
    return np.sqrt(quadratic_form)
def corr_eps_range(query_id, D, epsilon, mats):
    """Correlation epsilon-neighbourhood of D[query_id] (definition #11).

    (e) Assuming corr_eps_range is to return the correlation
    epsilon-neighbourhood, since not properly documented in the assignment.

    :param query_id: index of the query point in D.
    :param D: dataset.
    :param epsilon: neighbourhood radius in correlation distance.
    :param mats: per-point correlation similarity matrices (aligned with D).
    :return: list of indices into D whose correlation distance to the query
        is at most epsilon (includes query_id itself, whose distance is 0).
    """
    corr_eps_neighbourhood = []
    # Fix: start at 0 — the original range(1, len(D)) silently excluded
    # point 0 from every neighbourhood, inconsistent with eps_range and
    # with the definition (all points of D are candidates).
    for index in range(len(D)):
        correlation_distance = correlation_dist(
            D[query_id],
            mats[query_id],
            D[index],
            mats[index]
        )
        if correlation_distance <= epsilon:
            corr_eps_neighbourhood.append(index)
    return corr_eps_neighbourhood
def expand(outer, D, eps, minPts, cur_cluster_ID, mats, processed):
    """Try to grow a cluster from point index `outer` (4C expansion step).

    :param outer: index of the candidate core point.
    :param D: dataset.
    :param eps: correlation neighbourhood radius.
    :param minPts: minimum neighbourhood size for a core point.
    :param cur_cluster_ID: id assigned to all points of this cluster.
    :param mats: per-point correlation similarity matrices.
    :param processed: per-point labels, mutated in place (0 = unvisited).
    :return: True iff `outer` is a core point and a cluster was grown.
    """
    seed = corr_eps_range(outer, D, eps, mats)
    # Not a core point: nothing to expand.
    if len(seed) < minPts:
        return False
    # outer is a core point.
    processed[outer] = cur_cluster_ID
    while seed:
        q = seed.pop()
        if processed[q] == 0:
            # Renamed from `range`: the original shadowed the builtin.
            neighbourhood = corr_eps_range(q, D, eps, mats)
            # q is itself a core point: its neighbours become seeds too.
            if len(neighbourhood) >= minPts:
                seed.extend(neighbourhood)
            processed[q] = cur_cluster_ID
    return True
def algorithm4c(eps, minpts, kappa, delta):
    """Run the 4C clustering algorithm over the module-level dataset.

    Reads the module-level globals D (dataset) and n (its size); writes its
    working state into the globals `processed` and `cur_cluster_ID`.
    Note: didn't run with Python 3.5, hence the variable definition was
    moved outside the function.

    :param eps: correlation neighbourhood radius.
    :param minpts: minimum neighbourhood size for core points.
    :param kappa: eigenvalue penalty weight (see derive_local_correlations).
    :param delta: eigenvalue threshold (see derive_local_correlations).
    :return: array of length n with cluster ids (> 0), -1 marking noise.
    """
    global processed, cur_cluster_ID
    # Build local distance measures (one similarity matrix per point).
    cor_mat = derive_local_correlations(D, eps, kappa, delta)
    print('Finished preprocessing')
    # Run 4C proper.
    processed = np.zeros(n)
    cur_cluster_ID = 1
    for outer in range(n):
        if processed[outer] != 0:
            continue
        if expand(outer, D, eps, minpts, cur_cluster_ID, cor_mat, processed):
            print('Found cluster %s' % cur_cluster_ID)
            cur_cluster_ID += 1
        else:
            processed[outer] = -1
    return processed
################################
# Prepare and preprocess data.
################################
# Read data.
D = read_data('data.csv')
# Set dimensionality d and number of instances n.
d = np.shape(D[0])[0]
n = len(D)
print(n)
print(d)
xs = [x[0] for x in D]
ys = [x[1] for x in D]
# plt.plot(xs, ys, 'b.')
# plt.show()
# Declare global variables (assigned inside algorithm4c()).
processed = None
cur_cluster_ID = None
################################
# Execute 4C.
################################
# Set of local parameters.
minpts = 20
eps = 0.05
kappa = 50
results = []
# Prepare plotting: marker/colour cycle distinguishing clusters.
cluster_design = ['r.','b.','g.','y.','ro','bo','go','yo','r+','b+','g+','y+','r-','b-','g-','y-']
run_index = 0
delta_values = [1e-10, 1e-1, 0.6, 1]
# Arrange one subplot per delta value in a near-square grid.
n_rows = math.ceil(math.sqrt(len(delta_values)))
n_cols = math.ceil(len(delta_values) / n_rows)
fig, subplots = plt.subplots(n_rows, n_cols, sharex=True, sharey=True, squeeze=False)
for delta in delta_values:
    results.append(algorithm4c(eps, minpts, kappa, delta))
    # Fix: map the flat run index onto the grid by dividing/taking modulo
    # with n_cols. The original used n_rows for both, which is only correct
    # when the grid happens to be square.
    row, col = divmod(run_index, n_cols)
    # Set subplot's title.
    subplots[row, col].set_title(str(delta))
    for c in range(1, cur_cluster_ID):
        x = []
        y = []
        for i in range(0, n):
            if c == results[run_index][i]:
                x.append(D[i][0])
                y.append(D[i][1])
        # Plot result for current cluster.
        subplots[row, col].plot(x, y, cluster_design[(c - 1) % len(cluster_design)])
    # Keep track of number of models run so far.
    run_index += 1
# Show plots.
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment