Skip to content

Instantly share code, notes, and snippets.

@toinetoine
Created September 15, 2015 16:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save toinetoine/eb1f76efdea2d319e30d to your computer and use it in GitHub Desktop.
Save toinetoine/eb1f76efdea2d319e30d to your computer and use it in GitHub Desktop.
kmeans clafficiation with evenly-distributed initial mean selection for user pages liked data for http://stats.stackexchange.com/questions/167922/classify-users-by-the-pages-they-liked
import json
# read data from the json file
def read_data(filename):
# open the file and read data
with open(filename) as data_file:
data = json.load(data_file)
# for each item in the file add their id and data as a
# point in the points list
points = list()
for item in data["dataset"]:
point = {}
point["id"] = item["id"]
point["data"] = {}
for dimention in item["data"]:
point["data"][dimention] = 1.0
points.append(point)
return points
def get_magnitude(point):
magnitude = 0.0
data = point["data"]
for key in list(data.keys()):
magnitude += data[key]
return magnitude
# return distance between point_a and point_b as a scalar (absolute value)
def get_distance(point_a, point_b):
a_data = point_a["data"]
b_data = point_b["data"]
result_point = {}
result_point["id"] = "|" + point_a["id"] + "-" + point_b["id"] + "|"
result_point["data"] = {}
# for data in both a and b AND all data in a but not in b
for a_key in list(a_data.keys()):
if a_key not in list(b_data.keys()):
result_point["data"][a_key] = 1.0
else:
result_point["data"][a_key] = abs(a_data[a_key] - b_data[a_key])
# for data in b but not in a
for b_key in list(b_data.keys()):
if b_key not in list(a_data.keys()):
result_point["data"][b_key] = 1.0
return result_point
# return a points that's the result of point_a + point_b
def add_points(point_a, point_b):
a_data = point_a["data"]
b_data = point_b["data"]
result_point = {}
result_point["id"] = point_a["id"] + "+" + point_b["id"]
result_point["data"] = {}
# for data in both a and b AND all data in a but not in b
for a_key in list(a_data.keys()):
if a_key not in list(b_data.keys()):
result_point["data"][a_key] = a_data[a_key]
else:
result_point["data"][a_key] = (a_data[a_key] + b_data[a_key])
# for data in b but not in a
for b_key in list(b_data.keys()):
if b_key not in list(a_data.keys()):
result_point["data"][b_key] = b_data[b_key]
return result_point
# get the initial means
def get_inital_means(points, k):
# assert that there are more points than value k
assert (len(points) >= k)
ref_point = points[0]
distances = list()
# get the largest distance from the ref_point to another point
largest_distance = None
for p_i in range(len(points)):
distance = get_magnitude(get_distance(ref_point, points[p_i]))
if(largest_distance == None or distance > largest_distance):
largest_distance = distance
means = list()
non_means = points[:]
for k_i in range(k):
# get closest distance this interval of distance from the refrence point
closest_distance = None
p_i = 0
while(p_i < len(non_means)):
distance = get_magnitude(get_distance(ref_point, non_means[p_i]))
difference = abs(k_i*largest_distance/k - distance)
if closest_distance == None or difference < closest_distance["difference"]:
closest_distance = {}
closest_distance["difference"] = difference
closest_distance["point_index"] = p_i
p_i += 1
# add the closest point to the means list
means.append(non_means.pop(closest_distance["point_index"]))
# initially clusters only have the initial means in each
clusters = [list() for i in range(len(means))]
for i in range(len(means)): clusters[i].append(means[i])
# classify the rest of the data (the data that wasn't used as initial means)
final_means, clusters = classify(means, clusters, non_means)
return final_means, clusters
# classify each of the data points in data_points
# assumes there is an existing mean for each cluster
def classify(means, clusters, data_points):
final_means = list()
final_clusters = list()
for point in data_points:
smallest_distance = None
# get the mean this point is closest to
for i in range(len(means)):
this_dist = get_magnitude(get_distance(point, means[i]))
if smallest_distance == None or this_dist < smallest_distance["distance"]:
smallest_distance = {}
smallest_distance["index"] = i
smallest_distance["distance"] = this_dist
# add the point to the cluster and store the previous cluster length
selected_cluster_i = smallest_distance["index"]
clusters[selected_cluster_i].append(point)
cluster_len = len(clusters[selected_cluster_i]) - 1
# recalculate this clusters meme (accounting for the new point just added)
for key, prev_val in means[selected_cluster_i]["data"].items():
means[selected_cluster_i]["data"][key] = prev_val*(cluster_len - 1)
means[selected_cluster_i] = add_points(means[selected_cluster_i], point)
for key, prev_val in means[selected_cluster_i]["data"].items():
means[selected_cluster_i]["data"][key] /= len(clusters[selected_cluster_i])
return means, clusters
users = read_data("data.json")
# perform k means (it will select initial means and then
# classify all remaining non-means)
init_means, clusters = get_inital_means(users, 6)
# write resulting clusters to results.txt
result_file = open("results.txt", "w")
for cluster in clusters:
result_file.write("-------------------------\n")
result_file.write("Cluster: \n")
for point in cluster:
result_file.write("User: " + str(point["id"]) + "\t\t")
for key in point["data"].keys():
result_file.write(key + ": " + str(point["data"][key]))
result_file.write("\t\t")
result_file.write("\n\n")
result_file.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment