Skip to content

Instantly share code, notes, and snippets.

@rmitsch
Last active October 18, 2017 13:37
Show Gist options
  • Save rmitsch/8a2be762de94cbfe8051749421a0cd0b to your computer and use it in GitHub Desktop.
Save rmitsch/8a2be762de94cbfe8051749421a0cd0b to your computer and use it in GitHub Desktop.
Implementation of exercise 2-3 for VU Data Mining at University of Vienna.
import numpy
import numpy.lib.recfunctions
import scipy
import scipy.io.arff

import sklearn.feature_selection as skfs
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import normalized_mutual_info_score
def arff_to_ndarray(path_to_arff_file):
    """
    Converts content of an .arff file to a numpy matrix.

    Assumes the last attribute in the .arff file is the label/class column;
    all preceding attributes must be numeric.

    :param path_to_arff_file: path to (or file-like object containing) the
        .arff file.
    :return: numpy.ndarray matrix of feature values (float64), vector with
        labels/classes (bytes, as returned by scipy's arff reader).
    """
    # Load as numpy structured array plus metadata.
    data, meta = scipy.io.arff.loadarff(path_to_arff_file)

    # Last attribute holds the labels; everything before it is a feature.
    feature_names = meta.names()[:-1]
    labels = data[meta.names()[-1]]

    # Convert the structured (record) array to a plain 2-D float matrix.
    # The previous implementation used data.view(numpy.float), which breaks
    # twice on modern NumPy: the numpy.float alias was removed in NumPy 1.24,
    # and viewing a multi-field-indexed structured array as float64 fails
    # under the multi-field view semantics introduced in NumPy 1.16 (the
    # record itemsize still includes the label field's bytes).
    # structured_to_unstructured is the documented, safe conversion.
    features = numpy.lib.recfunctions.structured_to_unstructured(
        data[feature_names], dtype=numpy.float64
    )
    return features, labels
# Load dataset (Pima Indians diabetes data in .arff format; last column is the class).
X, y = arff_to_ndarray("diabetes.arff")

# 1. Feature selection, measured by the chi2 statistic (independence of
#    feature variable from class variable). Note chi2 requires non-negative
#    feature values, which holds for this dataset.
X_chi2 = skfs.SelectKBest(skfs.chi2, k=2).fit_transform(X, y)

# 2. Feature selection, measured by mutual information (dependency between
#    feature and target variable).
X_mi = skfs.SelectKBest(skfs.mutual_info_classif, k=2).fit_transform(X, y)

# 3. Apply k-means with the first two data points as initial cluster centers.
#    n_init=1 because the explicit init array makes initialization
#    deterministic. The n_jobs argument was deprecated in scikit-learn 0.23
#    and removed in 1.0, so it is no longer passed.
kmeans = KMeans(n_clusters=2, init=X[:2], n_init=1, max_iter=10000).fit(X)
kmeans_chi2 = KMeans(n_clusters=2, init=X_chi2[:2], n_init=1, max_iter=10000).fit(X_chi2)
kmeans_mi = KMeans(n_clusters=2, init=X_mi[:2], n_init=1, max_iter=10000).fit(X_mi)

# 4. Evaluate results using NMI between actual and predicted labels for the
#    different feature-selection approaches.
# Convert textual labels to 1/0 associations. Both assignments are built
# because k-means cluster IDs are arbitrary (NMI itself is permutation-
# invariant, but the accuracy cross-check below is not).
y_1_to_pos = numpy.asarray([1 if label == b'tested_positive' else 0 for label in y])
y_0_to_pos = numpy.asarray([0 if label == b'tested_positive' else 1 for label in y])

# Evaluate results for the original dataset with all features.
print(normalized_mutual_info_score(y_0_to_pos, kmeans.labels_))       # 0.0297237939655
# Evaluate results for the chi2-reduced dataset.
print(normalized_mutual_info_score(y_0_to_pos, kmeans_chi2.labels_))  # 0.0303491479031
# Evaluate results for the MI-reduced dataset.
print(normalized_mutual_info_score(y_0_to_pos, kmeans_mi.labels_))    # 0.139509464881

# Cross-check: calculate accuracy under both cluster-ID-to-label assignments.
print("---------------")
print(numpy.sum(y_0_to_pos == kmeans.labels_) / len(y))       # 0.66015625
print(numpy.sum(y_1_to_pos == kmeans.labels_) / len(y))       # 0.33984375
print(numpy.sum(y_0_to_pos == kmeans_chi2.labels_) / len(y))  # 0.66015625
print(numpy.sum(y_1_to_pos == kmeans_chi2.labels_) / len(y))  # 0.33984375
print(numpy.sum(y_0_to_pos == kmeans_mi.labels_) / len(y))    # 0.66015625
print(numpy.sum(y_1_to_pos == kmeans_mi.labels_) / len(y))    # 0.33984375
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment