Skip to content

Instantly share code, notes, and snippets.

@rmitsch
Last active November 7, 2017 12:05
Show Gist options
  • Save rmitsch/1a117e1d0a027ef36cf94b8f8146bce2 to your computer and use it in GitHub Desktop.
3_1_PreDeCon
"""
Exercise 3-1 for course Data Mining at University of Vienna, winter semester 2017.
Note that this code implementing some definitions for PreDeCon serves demonstration purposes and is not optimized for
performance.
"""
import numpy
import math
# 0. Define hyperparameters.
param_mu = 3
param_epsilon = 1
param_theta = 0.25
param_lambda = 1
param_kappa = 100
# ------------------------------------------------------- #
# 1. Define points.
# Sequence: Ascending from P1 to P12.
D = numpy.array([[1, 6], [2, 6], [3, 6], [4, 6], [5, 6], [6, 6], [7, 8], [7, 7], [7, 6], [7, 5], [7, 4], [7, 3]])
num_points = len(D)
num_dimensions = len(D[0])
# ------------------------------------------------------- #
# 3. Gather epsilon-neighbourhood for each point.
epsilon_neighbourhoods = []
# Iterate over points.
for i in range(0, num_points):
epsilon_neighbourhood_i = []
# Check all other points.
for j in range(0, num_points):
# Append to neighbourhood array if norm is smaller then epsilon.
if numpy.linalg.norm(D[i] - D[j]) <= param_epsilon:
epsilon_neighbourhood_i.append(j)
epsilon_neighbourhoods.append(epsilon_neighbourhood_i)
# ------------------------------------------------------- #
# 4. Calculate attribute variance.
attribute_variances = [[] for i in range(0, num_points)]
# Loop over dimensions/attributes.
for i_attribute in range(0, num_dimensions):
# Loop over points.
for i_datapoint in range(0, num_points):
# Get number of epsilon-neighbours.
number_of_epsilon_neighbours = len(epsilon_neighbourhoods[i_datapoint])
# Store sum of squared distances to neighbours.
distance_sum = 0
# Loop over neighbouring points.
for i_datapoint_neighbour in epsilon_neighbourhoods[i_datapoint]:
# Calculate distance between origin point and epsilon-neighbour.
distance_sum += (D[i_datapoint][i_attribute] - D[i_datapoint_neighbour][i_attribute]) ** 2
# Add variance to collection.
attribute_variances[i_datapoint].append(distance_sum / number_of_epsilon_neighbours)
# ------------------------------------------------------- #
# 5. Calculate subspace preference dimensionality.
subspace_preference_dimensionalities = []
for i_datapoint in range(0, num_points):
# Sum up number of this datapoint's attribute projections with variance <= theta.
subspace_preference_dimensionalities.append(
sum(att_var <= param_theta for att_var in attribute_variances[i_datapoint])
)
# ------------------------------------------------------- #
# 6. Calculate subspace preference vectors.
subspace_preference_vectors = [[] for i in range(0, num_points)]
for i_datapoint in range(0, num_points):
subspace_preference_vectors[i_datapoint] = \
[1 if att_var > param_theta else param_kappa for att_var in attribute_variances[i_datapoint]]
# ------------------------------------------------------- #
# 7. Calculate preference weighted similarity measures. between all datapoints.
pref_weighted_similarity_measures = [[] for i in range(0, num_points)]
for i_datapoint in range(0, num_points):
for j_datapoint in range(0, num_points):
if i_datapoint != j_datapoint:
pref_weighted_similarity_measures[i_datapoint].append(
math.sqrt(
sum(
(
1.0 / subspace_preference_vectors[i_datapoint][i_attribute] *
((D[i_datapoint][i_attribute] - D[j_datapoint][i_attribute]) ** 2)
)
for i_attribute in range(0, num_dimensions)
)
))
else:
pref_weighted_similarity_measures[i_datapoint].append(0.0)
# ------------------------------------------------------- #
# 8. Preference weighted epsilon-neighbourhood.
preference_weighted_neighbourhoods = [[] for i in range(0, num_points)]
for i_datapoint in range(0, num_points):
for j_datapoint in range(0, num_points):
dist_pref = max(
pref_weighted_similarity_measures[i_datapoint][j_datapoint],
pref_weighted_similarity_measures[j_datapoint][i_datapoint]
)
if dist_pref <= param_epsilon:
preference_weighted_neighbourhoods[i_datapoint].append(j_datapoint)
# ------------------------------------------------------- #
# 9. Determine core points.
core_points = []
for i_datapoint in range(0, num_points):
# Subspace preference dimensionality <= lambda?
if subspace_preference_dimensionalities[i_datapoint] <= param_lambda and \
len(preference_weighted_neighbourhoods[i_datapoint]) >= param_mu:
core_points.append(i_datapoint)
# ------------------------------------------------------- #
print("--------------------------------------------")
print("Core points:", core_points)
print("--------------------------------------------")
@rmitsch
Copy link
Author

rmitsch commented Nov 7, 2017

R3 allows a point to be in its own epsilon-neighbourhood (as opposed to R2).
Results:

  • R3: Core points: [1, 2, 3, 4, 5, 7, 8, 9, 10]
  • R2: Core points: [8]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment