Created
September 15, 2015 16:17
-
-
Save toinetoine/eb1f76efdea2d319e30d to your computer and use it in GitHub Desktop.
kmeans clafficiation with evenly-distributed initial mean selection for user pages liked data for http://stats.stackexchange.com/questions/167922/classify-users-by-the-pages-they-liked
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
# read data from the json file | |
def read_data(filename): | |
# open the file and read data | |
with open(filename) as data_file: | |
data = json.load(data_file) | |
# for each item in the file add their id and data as a | |
# point in the points list | |
points = list() | |
for item in data["dataset"]: | |
point = {} | |
point["id"] = item["id"] | |
point["data"] = {} | |
for dimention in item["data"]: | |
point["data"][dimention] = 1.0 | |
points.append(point) | |
return points | |
def get_magnitude(point): | |
magnitude = 0.0 | |
data = point["data"] | |
for key in list(data.keys()): | |
magnitude += data[key] | |
return magnitude | |
# return distance between point_a and point_b as a scalar (absolute value) | |
def get_distance(point_a, point_b): | |
a_data = point_a["data"] | |
b_data = point_b["data"] | |
result_point = {} | |
result_point["id"] = "|" + point_a["id"] + "-" + point_b["id"] + "|" | |
result_point["data"] = {} | |
# for data in both a and b AND all data in a but not in b | |
for a_key in list(a_data.keys()): | |
if a_key not in list(b_data.keys()): | |
result_point["data"][a_key] = 1.0 | |
else: | |
result_point["data"][a_key] = abs(a_data[a_key] - b_data[a_key]) | |
# for data in b but not in a | |
for b_key in list(b_data.keys()): | |
if b_key not in list(a_data.keys()): | |
result_point["data"][b_key] = 1.0 | |
return result_point | |
# return a points that's the result of point_a + point_b | |
def add_points(point_a, point_b): | |
a_data = point_a["data"] | |
b_data = point_b["data"] | |
result_point = {} | |
result_point["id"] = point_a["id"] + "+" + point_b["id"] | |
result_point["data"] = {} | |
# for data in both a and b AND all data in a but not in b | |
for a_key in list(a_data.keys()): | |
if a_key not in list(b_data.keys()): | |
result_point["data"][a_key] = a_data[a_key] | |
else: | |
result_point["data"][a_key] = (a_data[a_key] + b_data[a_key]) | |
# for data in b but not in a | |
for b_key in list(b_data.keys()): | |
if b_key not in list(a_data.keys()): | |
result_point["data"][b_key] = b_data[b_key] | |
return result_point | |
# get the initial means | |
def get_inital_means(points, k): | |
# assert that there are more points than value k | |
assert (len(points) >= k) | |
ref_point = points[0] | |
distances = list() | |
# get the largest distance from the ref_point to another point | |
largest_distance = None | |
for p_i in range(len(points)): | |
distance = get_magnitude(get_distance(ref_point, points[p_i])) | |
if(largest_distance == None or distance > largest_distance): | |
largest_distance = distance | |
means = list() | |
non_means = points[:] | |
for k_i in range(k): | |
# get closest distance this interval of distance from the refrence point | |
closest_distance = None | |
p_i = 0 | |
while(p_i < len(non_means)): | |
distance = get_magnitude(get_distance(ref_point, non_means[p_i])) | |
difference = abs(k_i*largest_distance/k - distance) | |
if closest_distance == None or difference < closest_distance["difference"]: | |
closest_distance = {} | |
closest_distance["difference"] = difference | |
closest_distance["point_index"] = p_i | |
p_i += 1 | |
# add the closest point to the means list | |
means.append(non_means.pop(closest_distance["point_index"])) | |
# initially clusters only have the initial means in each | |
clusters = [list() for i in range(len(means))] | |
for i in range(len(means)): clusters[i].append(means[i]) | |
# classify the rest of the data (the data that wasn't used as initial means) | |
final_means, clusters = classify(means, clusters, non_means) | |
return final_means, clusters | |
# classify each of the data points in data_points | |
# assumes there is an existing mean for each cluster | |
def classify(means, clusters, data_points): | |
final_means = list() | |
final_clusters = list() | |
for point in data_points: | |
smallest_distance = None | |
# get the mean this point is closest to | |
for i in range(len(means)): | |
this_dist = get_magnitude(get_distance(point, means[i])) | |
if smallest_distance == None or this_dist < smallest_distance["distance"]: | |
smallest_distance = {} | |
smallest_distance["index"] = i | |
smallest_distance["distance"] = this_dist | |
# add the point to the cluster and store the previous cluster length | |
selected_cluster_i = smallest_distance["index"] | |
clusters[selected_cluster_i].append(point) | |
cluster_len = len(clusters[selected_cluster_i]) - 1 | |
# recalculate this clusters meme (accounting for the new point just added) | |
for key, prev_val in means[selected_cluster_i]["data"].items(): | |
means[selected_cluster_i]["data"][key] = prev_val*(cluster_len - 1) | |
means[selected_cluster_i] = add_points(means[selected_cluster_i], point) | |
for key, prev_val in means[selected_cluster_i]["data"].items(): | |
means[selected_cluster_i]["data"][key] /= len(clusters[selected_cluster_i]) | |
return means, clusters | |
users = read_data("data.json") | |
# perform k means (it will select initial means and then | |
# classify all remaining non-means) | |
init_means, clusters = get_inital_means(users, 6) | |
# write resulting clusters to results.txt | |
result_file = open("results.txt", "w") | |
for cluster in clusters: | |
result_file.write("-------------------------\n") | |
result_file.write("Cluster: \n") | |
for point in cluster: | |
result_file.write("User: " + str(point["id"]) + "\t\t") | |
for key in point["data"].keys(): | |
result_file.write(key + ": " + str(point["data"][key])) | |
result_file.write("\t\t") | |
result_file.write("\n\n") | |
result_file.close() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment