Skip to content

Instantly share code, notes, and snippets.

@rmitsch
Created November 13, 2017 20:41
Show Gist options
  • Save rmitsch/7bb1364a4c6d637cfcb5d2b4a055cf75 to your computer and use it in GitHub Desktop.
import sys
import numpy as np
import matplotlib.pyplot as plt
from numpy.linalg import eigh
import math
def read_data(file):
    """Read a ';'-delimited CSV file into a list of 1-D numpy vectors.

    Each line is split on ';' and the final field is discarded (the files
    end every record with a trailing ';', which yields an empty last field).
    Remaining fields are converted to floats.

    :param file: path of the CSV file to read.
    :return: list of np.ndarray, one vector per line.
    """
    data = []
    # Context manager guarantees the handle is closed even if a line fails
    # to parse (the original left the file open on error).
    with open(file, 'r') as f:
        for line in f:
            fields = line.split(';')[:-1]
            # np.array(..., dtype=float) performs the per-field str->float
            # conversion the original did element by element.
            data.append(np.array(fields, dtype=float))
    return data
def eps_range(q, D, epsilon):
    """Return the points of D lying in the epsilon-ball around q.

    (b)

    :param q: query point (1-D array-like).
    :param D: dataset, a sequence of points.
    :param epsilon: neighbourhood radius (Euclidean).
    :return: list of np.ndarray copies of the qualifying points.
    """
    return [np.array(point)
            for point in D
            if np.linalg.norm(q - point) <= epsilon]
def derive_local_correlations(D, epsilon, kappa, delta):
    """Compute each point's correlation similarity matrix.

    (c). See definition #10 and introduction to 3.3: "...which is determined
    by PCA of the points in the epsilon-neighborhood of P." - eigenvalues and
    eigenvectors for PCA are calculated on the point's corresponding
    epsilon-neighbourhood.

    :param D: dataset.
    :param epsilon: neighbourhood radius for the local PCA.
    :param kappa: weight assigned to small (thresholded) eigenvalues.
    :param delta: threshold on the max-normalised eigenvalues.
    :return: list of correlation similarity matrices, one per point of D.
    """
    matrices = []
    for point in D:
        # PCA of the point's eps-neighbourhood: eigendecomposition of the
        # neighbourhood's covariance (rowvar=False: rows are observations,
        # columns are variables).
        values, vectors = eigh(np.cov(eps_range(point, D, epsilon), rowvar=False))
        # Invert normalized eigenvalues. Eigenvalues are divided by their
        # maximum first, which is conveniently not described in detail in the
        # actual definition: large directions -> 1, small ones -> kappa.
        adapted = np.where(values / np.max(values) > delta, 1, kappa)
        matrices.append(vectors @ np.diag(adapted) @ vectors.T)
    return matrices
def correlation_dist(x, S1, y, S2):
    """Symmetric correlation distance between two points (def. #10).

    (d) Assuming max() is the operation for combining distances and the
    correlation similarity matrix equals the "local distance matrix".

    :param x: first point.
    :param S1: correlation similarity matrix of x.
    :param y: second point.
    :param S2: correlation similarity matrix of y.
    :return: the larger of the two directed correlation similarities.
    """
    dist_xy = calculate_correlation_similarity(x, y, S1)
    dist_yx = calculate_correlation_similarity(y, x, S2)
    return dist_xy if dist_xy >= dist_yx else dist_yx
def calculate_correlation_similarity(x, y, correlation_similarity_matrix_for_x):
    """Directed correlation similarity used for the correlation distance (def. #10).

    Computes sqrt((x - y) S (x - y)^T) with S the similarity matrix of x.

    :param x: first point.
    :param y: second point.
    :param correlation_similarity_matrix_for_x: local similarity matrix of x.
    :return: non-negative scalar similarity value.
    """
    difference = np.subtract(x, y)
    # Quadratic form (x - y) S (x - y)^T; transpose is a no-op for 1-D arrays.
    quadratic_form = difference.dot(correlation_similarity_matrix_for_x).dot(difference)
    return np.sqrt(quadratic_form)
def corr_eps_range(query_id, D, epsilon, mats):
    """Correlation epsilon-neighbourhood of D[query_id] (definition #11).

    (e) Assuming corr_eps_range is to return the correlation
    epsilon-neighbourhood, since not properly documented in the assignment.

    :param query_id: index of the query point in D.
    :param D: dataset.
    :param epsilon: neighbourhood radius in correlation distance.
    :param mats: per-point correlation similarity matrices (aligned with D).
    :return: list of indices into D whose correlation distance to the query
        is at most epsilon (includes query_id itself, whose distance is 0).
    """
    corr_eps_neighbourhood = []
    # Fix: start at 0 — the original range(1, len(D)) silently excluded
    # point 0 from every neighbourhood, inconsistent with eps_range and
    # with the definition (all points of D are candidates).
    for index in range(len(D)):
        correlation_distance = correlation_dist(
            D[query_id],
            mats[query_id],
            D[index],
            mats[index]
        )
        if correlation_distance <= epsilon:
            corr_eps_neighbourhood.append(index)
    return corr_eps_neighbourhood
def expand(outer, D, eps, minPts, cur_cluster_ID, mats, processed):
    """Try to grow a cluster from point index `outer` (4C expansion step).

    :param outer: index of the candidate core point.
    :param D: dataset.
    :param eps: correlation neighbourhood radius.
    :param minPts: minimum neighbourhood size for a core point.
    :param cur_cluster_ID: id assigned to all points of this cluster.
    :param mats: per-point correlation similarity matrices.
    :param processed: per-point labels, mutated in place (0 = unvisited).
    :return: True iff `outer` is a core point and a cluster was grown.
    """
    seed = corr_eps_range(outer, D, eps, mats)
    # Not a core point: nothing to expand.
    if len(seed) < minPts:
        return False
    # outer is a core point.
    processed[outer] = cur_cluster_ID
    while seed:
        q = seed.pop()
        if processed[q] == 0:
            # Renamed from `range`: the original shadowed the builtin.
            neighbourhood = corr_eps_range(q, D, eps, mats)
            # q is itself a core point: its neighbours become seeds too.
            if len(neighbourhood) >= minPts:
                seed.extend(neighbourhood)
            processed[q] = cur_cluster_ID
    return True
def algorithm4c(eps, minpts, kappa, delta):
    """Run the 4C clustering algorithm over the module-level dataset.

    Reads the module-level globals D (dataset) and n (its size); writes its
    working state into the globals `processed` and `cur_cluster_ID`.
    Note: didn't run with Python 3.5, hence the variable definition was
    moved outside the function.

    :param eps: correlation neighbourhood radius.
    :param minpts: minimum neighbourhood size for core points.
    :param kappa: eigenvalue penalty weight (see derive_local_correlations).
    :param delta: eigenvalue threshold (see derive_local_correlations).
    :return: array of length n with cluster ids (> 0), -1 marking noise.
    """
    global processed, cur_cluster_ID
    # Build local distance measures (one similarity matrix per point).
    cor_mat = derive_local_correlations(D, eps, kappa, delta)
    print('Finished preprocessing')
    # Run 4C proper.
    processed = np.zeros(n)
    cur_cluster_ID = 1
    for outer in range(n):
        if processed[outer] != 0:
            continue
        if expand(outer, D, eps, minpts, cur_cluster_ID, cor_mat, processed):
            print('Found cluster %s' % cur_cluster_ID)
            cur_cluster_ID += 1
        else:
            processed[outer] = -1
    return processed
################################
# Prepare and preprocess data.
################################
# Read data.
D = read_data('data.csv')
# Set dimensionality d and number of instances n.
d = np.shape(D[0])[0]
n = len(D)
print(n)
print(d)
xs = [x[0] for x in D]
ys = [x[1] for x in D]
# plt.plot(xs, ys, 'b.')
# plt.show()
# Declare global variables (assigned inside algorithm4c()).
processed = None
cur_cluster_ID = None
################################
# Execute 4C.
################################
# Set of local parameters.
minpts = 20
eps = 0.05
kappa = 50
results = []
# Prepare plotting: marker/colour cycle distinguishing clusters.
cluster_design = ['r.','b.','g.','y.','ro','bo','go','yo','r+','b+','g+','y+','r-','b-','g-','y-']
run_index = 0
delta_values = [1e-10, 1e-1, 0.6, 1]
# Arrange one subplot per delta value in a near-square grid.
n_rows = math.ceil(math.sqrt(len(delta_values)))
n_cols = math.ceil(len(delta_values) / n_rows)
fig, subplots = plt.subplots(n_rows, n_cols, sharex=True, sharey=True, squeeze=False)
for delta in delta_values:
    results.append(algorithm4c(eps, minpts, kappa, delta))
    # Fix: map the flat run index onto the grid by dividing/taking modulo
    # with n_cols. The original used n_rows for both, which is only correct
    # when the grid happens to be square.
    row, col = divmod(run_index, n_cols)
    # Set subplot's title.
    subplots[row, col].set_title(str(delta))
    for c in range(1, cur_cluster_ID):
        x = []
        y = []
        for i in range(0, n):
            if c == results[run_index][i]:
                x.append(D[i][0])
                y.append(D[i][1])
        # Plot result for current cluster.
        subplots[row, col].plot(x, y, cluster_design[(c - 1) % len(cluster_design)])
    # Keep track of number of models run so far.
    run_index += 1
# Show plots.
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment