Skip to content

Instantly share code, notes, and snippets.

@Ladsgroup
Created August 27, 2015 18:59
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Ladsgroup/bd741d8e96d84264eef8 to your computer and use it in GitHub Desktop.
Save Ladsgroup/bd741d8e96d84264eef8 to your computer and use it in GitHub Desktop.
Clustering reverted edits in Wikipedia
import codecs
import math
import sklearn.cluster
import matplotlib.pyplot as plt
# Load the feature rows from the TSV file and collect the distinct feature
# vectors of every row whose last column is True/1 (presumably the
# "reverted" label -- confirm against the TSV schema).
x = set()  # distinct feature tuples with the last (label) column removed
c = 0  # number of rows kept, duplicates included
path = '/home/amir/Downloads/featuresetsforclustering/ptwiki.features_reverted.20k.tsv'
with codecs.open(path, 'r', 'utf-8') as f:
    for line in f:
        # Strip LF and CRLF endings alike; with only '\n' removed a trailing
        # "True\r" would not match 'True' and float() would raise.
        line = line.rstrip('\r\n')
        if not line.strip():
            # Blank lines (e.g. at end of file) carry no features.
            continue
        features = []
        for feature in line.split('\t'):
            # Booleans are encoded as 0/1, everything else is parsed as float.
            if feature == 'False':
                features.append(0)
            elif feature == 'True':
                features.append(1)
            else:
                features.append(float(feature))
        # Keep only the rows whose last column equals 1.
        if features[-1] != 1:
            continue
        c += 1
        x.add(tuple(features[:-1]))
print(len(x))
def mean_func(gen):
    """Return the arithmetic mean of the numbers in ``gen``.

    ``gen`` may be any iterable of numbers, including a one-shot iterator
    or generator. ``None`` and empty iterables yield 0.

    The emptiness check is done on the element count rather than on the
    truthiness of ``gen``: an exhausted iterator is truthy, so testing
    ``gen`` itself would let an empty generator reach the division below.
    """
    if gen is None:
        return 0
    total = 0
    count = 0
    for case in gen:
        count += 1
        total += case
    if count == 0:
        return 0
    return total / float(count)
def std(gen, mean=None):
    """Return the population standard deviation of the numbers in ``gen``.

    ``gen`` may be any iterable of numbers, including a one-shot iterator.
    ``mean`` is the precomputed mean of the values; when it is ``None`` the
    mean is computed with ``mean_func``. ``None`` and empty iterables
    yield 0.
    """
    if gen is None:
        return 0
    # Materialise once: the values may be needed twice (mean and variance),
    # and an iterator can only be consumed a single time.
    values = list(gen)
    if not values:
        return 0
    # Compare against None explicitly: a supplied mean of 0 is a valid value
    # and must not be mistaken for "not provided".
    if mean is None:
        mean = mean_func(values)
    variance = 0
    for case in values:
        variance += (case - mean) ** 2
    return math.sqrt(variance / float(len(values)))
# Standardise every feature column: subtract the column mean and divide by
# the column's population standard deviation.

# Column index -> list of that column's values over all cases in x.
# Appending in place avoids re-copying the whole column list for every row.
x_for_scaling = {}
for case in x:
    for i, value in enumerate(case):
        x_for_scaling.setdefault(i, []).append(value)

# Column index -> (mean, standard deviation).
mean_and_std = {}
for i in x_for_scaling:
    mean = mean_func(x_for_scaling[i])
    std_var = std(x_for_scaling[i], mean)
    mean_and_std[i] = (mean, std_var)

# Distinct standardised feature tuples used as clustering input.
training_set = set()
for case in x:
    new_case = []
    for i, value in enumerate(case):
        col_mean, col_std = mean_and_std[i]
        if col_std:
            new_case.append((value - col_mean) / col_std)
        else:
            # A constant column has zero deviation; every value equals the
            # mean, so its standardised value is 0 (avoids dividing by zero).
            new_case.append(0.0)
    training_set.add(tuple(new_case))
# Fit KMeans for a range of cluster counts and record, for each count, the
# mean distance of a sample to its nearest cluster centre. The resulting
# curve is plotted so the "elbow" can be inspected visually.
cost_function = {}  # number of clusters -> mean nearest-centre distance
res_for_plot = []  # the same costs, in order of increasing cluster count
cluster_counts = list(range(1, 12))
# The sample list does not change between fits, so build it once.
samples = list(training_set)
for n in cluster_counts:
    classi = sklearn.cluster.KMeans(n_clusters=n)
    # fit_transform returns, per sample, the distance to every cluster
    # centre; the smallest entry is the distance to the assigned cluster.
    res = classi.fit_transform(samples)
    cost_temp = 0
    for case in res:
        cost_temp += min(case)
    cost_function[n] = cost_temp / len(res)
    res_for_plot.append(cost_function[n])
    if n == 2:
        print(classi.cluster_centers_)
print(cost_function)
# Change in cost between consecutive cluster counts. Every adjacent pair is
# reported, including the last one (11 - 10). A formatted string is used so
# the line reads the same under the print statement and print function.
for prev_n, next_n in zip(cluster_counts, cluster_counts[1:]):
    print('%d - %d : %s' % (next_n, prev_n,
                            cost_function[next_n] - cost_function[prev_n]))
plt.plot(cluster_counts, res_for_plot)
plt.ylabel('Cost function')
plt.xlabel('Number of clusters')
# The first two characters of the file name are used as the wiki code.
plt.title('Cost function per number of clusters in reverted edits in %s.wp' % path.split('/')[-1][:2])
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment