Skip to content

Instantly share code, notes, and snippets.

@Ladsgroup
Created November 2, 2015 13:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Ladsgroup/94bdcdf85bb40162119c to your computer and use it in GitHub Desktop.
Save Ladsgroup/94bdcdf85bb40162119c to your computer and use it in GitHub Desktop.
Old k-means algorithm
[0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1
0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 1
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0
0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0
0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0
1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0
0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0
0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0]
import codecs
import math
import sklearn.cluster
import matplotlib.pyplot as plt
from collections import defaultdict
from scipy.stats import halfnorm
# Load the TSV file and collect the distinct feature vectors of every row
# whose last column is true.
x = set()
c = 0  # number of accepted rows, duplicates included
path = '/home/amir/sigclust/enwiki_data/data2.tsv'
with codecs.open(path, 'r', 'utf-8') as f:
    for line in f:
        # codecs.open does no newline translation, so strip '\r' as well;
        # otherwise a trailing 'True\r' would not be recognised as a boolean.
        line = line.rstrip('\r\n')
        if not line:
            # A blank line has no columns; float('') would raise ValueError.
            continue
        features = []
        for feature in line.split('\t'):
            if feature == 'False':
                features.append(0)
            elif feature == 'True':
                features.append(1)
            else:
                features.append(float(feature))
        # Keep only rows whose last column is 1/True (presumably the
        # "reverted" label, going by the plot title -- confirm).
        if features[-1] != 1:
            continue
        c += 1
        # Drop the first column and the trailing label; the tuple makes the
        # row hashable so the set de-duplicates identical feature vectors.
        x.add(tuple(features[1:-1]))
print(len(x))
def mean_func(gen):
    """Return the arithmetic mean of the numbers in ``gen``.

    ``gen`` may be any iterable of numbers, including a one-shot iterator
    or generator. ``None`` and empty iterables yield ``0`` instead of
    raising ``ZeroDivisionError``.
    """
    if gen is None:
        return 0
    total = 0
    count = 0
    for case in gen:
        count += 1
        total += case
    if count == 0:
        # Checked after iterating: a truthiness test on the argument cannot
        # tell that a generator or iterator is empty.
        return 0
    return total / float(count)
def std(gen, mean=None):
    """Return the population standard deviation of the numbers in ``gen``.

    The squared deviations are divided by the number of values (not by
    ``n - 1``).

    ``gen`` may be any iterable of numbers; it is materialised once so that
    one-shot iterators work even when the mean has to be computed first.
    ``mean`` is the mean to measure deviations from; when it is ``None`` the
    mean is computed with ``mean_func``. An explicit mean of ``0`` is
    honoured rather than treated as "not supplied".

    ``None`` and empty iterables yield ``0``.
    """
    if gen is None:
        return 0
    values = list(gen)
    if not values:
        return 0
    if mean is None:
        mean = mean_func(values)
    variance = 0
    for case in values:
        variance += (case - mean) ** 2
    return math.sqrt(variance / float(len(values)))
# Standardise every feature column of ``x`` to zero mean and unit deviation.

# Column index -> list of that column's values across all cases.
x_for_scaling = {}
for case in x:
    for i, value in enumerate(case):
        # Append in place; rebuilding the list with ``+`` for every value
        # would copy the whole column each time.
        x_for_scaling.setdefault(i, []).append(value)

# Column index -> (mean, standard deviation).
mean_and_std = {}
for i, column in x_for_scaling.items():
    mean = mean_func(column)
    std_var = std(column, mean)
    mean_and_std[i] = (mean, std_var)

training_set = set()
for case in x:
    new_case = []
    for i, value in enumerate(case):
        mean, std_var = mean_and_std[i]
        if std_var:
            new_case.append((value - mean) / std_var)
        else:
            # A constant column has zero deviation; every value equals the
            # mean, so map it to 0.0 instead of dividing by zero.
            new_case.append(0.0)
    training_set.add(tuple(new_case))
# Run k-means for 1..11 clusters and record, for each cluster count, the
# mean distance between a sample and its nearest cluster centre.
cost_function = {}  # number of clusters -> mean nearest-centre distance
res_for_plot = []   # the same costs, in order of increasing cluster count
# KMeans needs an ordered collection and the loop below pairs samples with
# result rows; convert once instead of on every iteration.
training_set = list(training_set)
for n in range(1, 12):
    classi = sklearn.cluster.KMeans(n_clusters=n)
    # fit_transform returns one row per sample holding its distance to each
    # of the n cluster centres.
    res = classi.fit_transform(training_set)
    cost_temp = 0
    # Cluster index -> samples whose nearest centre is that cluster.
    dist = defaultdict(list)
    for sample, distances in zip(training_set, res):
        distances = list(distances)
        nearest = min(distances)
        cost_temp += nearest
        dist[distances.index(nearest)].append(sample)
    if n == 2:
        print(classi.labels_)
    cost_function[n] = cost_temp / len(res)
    res_for_plot.append(cost_function[n])
# Report the cost per cluster count, the change in cost for each additional
# cluster, and plot the cost curve.
print(cost_function)
cluster_counts = sorted(cost_function)
# Walk every consecutive pair of cluster counts that was actually computed,
# so the last step (11 - 10) is reported as well.
for previous, current in zip(cluster_counts, cluster_counts[1:]):
    print(current, '-', previous, ':', cost_function[current] - cost_function[previous])
plt.plot(cluster_counts, res_for_plot)
plt.ylabel('Cost function')
plt.xlabel('Number of clusters')
# The wiki prefix comes from the data directory name ('enwiki_data' -> 'en');
# the file name itself ('data2.tsv') would yield the misleading 'da'.
# NOTE(review): assumes the parent directory is always named '<lang>wiki_data'.
plt.title('Cost function per number of clusters in reverted edits in %s.wp' % path.split('/')[-2][:2])
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment