toinetoine/kmeans.py

## kmeans.py
import json

# read data from the json file
def read_data(filename):

  # open the file and read data
  with open(filename) as data_file:
      data = json.load(data_file)

      # for each item in the file add their id and data as a
      # point in the points list
      points = list()
      for item in data["dataset"]:
        point = {}
        point["id"] = item["id"]
        point["data"] = {}
        for dimention in item["data"]:
          point["data"][dimention] = 1.0
        points.append(point)

      return points

def get_magnitude(point):
  magnitude = 0.0
  data = point["data"]
  for key in list(data.keys()):
    magnitude += data[key]
  return magnitude

# return distance between point_a and point_b as a scalar (absolute value)
def get_distance(point_a, point_b):
  a_data = point_a["data"]
  b_data = point_b["data"]

  result_point = {}
  result_point["id"] = "|" + point_a["id"] + "-" + point_b["id"] + "|"
  result_point["data"] = {}

  # for data in both a and b AND all data in a but not in b
  for a_key in list(a_data.keys()):
    if a_key not in list(b_data.keys()):
      result_point["data"][a_key] = 1.0
    else:
      result_point["data"][a_key] = abs(a_data[a_key] - b_data[a_key])
  # for data in b but not in a
  for b_key in list(b_data.keys()):
    if b_key not in list(a_data.keys()):
      result_point["data"][b_key] = 1.0

  return result_point

# return a points that's the result of point_a + point_b
def add_points(point_a, point_b):
  a_data = point_a["data"]
  b_data = point_b["data"]

  result_point = {}
  result_point["id"] = point_a["id"] + "+" + point_b["id"]
  result_point["data"] = {}
  # for data in both a and b AND all data in a but not in b
  for a_key in list(a_data.keys()):
    if a_key not in list(b_data.keys()):
      result_point["data"][a_key] = a_data[a_key]
    else:
      result_point["data"][a_key] = (a_data[a_key] + b_data[a_key])
  # for data in b but not in a
  for b_key in list(b_data.keys()):
    if b_key not in list(a_data.keys()):
      result_point["data"][b_key] = b_data[b_key]

  return result_point

# get the initial means
def get_inital_means(points, k):
  # assert that there are more points than value k
  assert (len(points) >= k)

  ref_point = points[0]
  distances = list()

  # get the largest distance from the ref_point to another point
  largest_distance = None
  for p_i in range(len(points)):
    distance = get_magnitude(get_distance(ref_point, points[p_i]))

    if(largest_distance == None or distance > largest_distance):
      largest_distance = distance

  means = list()
  non_means = points[:]
  for k_i in range(k):
    # get closest distance this interval of distance from the refrence point
    closest_distance = None
    p_i = 0
    while(p_i < len(non_means)):
      distance = get_magnitude(get_distance(ref_point, non_means[p_i]))
      difference = abs(k_i*largest_distance/k - distance)
      if closest_distance == None or difference < closest_distance["difference"]:
        closest_distance = {}
        closest_distance["difference"] = difference
        closest_distance["point_index"] = p_i
      p_i += 1

    # add the closest point to the means list
    means.append(non_means.pop(closest_distance["point_index"]))

  # initially clusters only have the initial means in each
  clusters = [list() for i in range(len(means))]
  for i in range(len(means)): clusters[i].append(means[i])

  # classify the rest of the data (the data that wasn't used as initial means)
  final_means, clusters = classify(means, clusters, non_means)

  return final_means, clusters

# classify each of the data points in data_points
# assumes there is an existing mean for each cluster
def classify(means, clusters, data_points):
  final_means = list()
  final_clusters = list()

  for point in data_points:
    smallest_distance = None

    # get the mean this point is closest to
    for i in range(len(means)):
      this_dist = get_magnitude(get_distance(point, means[i]))
      if smallest_distance == None or this_dist < smallest_distance["distance"]:
        smallest_distance = {}
        smallest_distance["index"] = i
        smallest_distance["distance"] = this_dist

    # add the point to the cluster and store the previous cluster length
    selected_cluster_i = smallest_distance["index"]
    clusters[selected_cluster_i].append(point)
    cluster_len = len(clusters[selected_cluster_i]) - 1

    # recalculate this clusters meme (accounting for the new point just added)
    for key, prev_val in means[selected_cluster_i]["data"].items():
      means[selected_cluster_i]["data"][key] = prev_val*(cluster_len - 1)
    means[selected_cluster_i] = add_points(means[selected_cluster_i], point)
    for key, prev_val in means[selected_cluster_i]["data"].items():
      means[selected_cluster_i]["data"][key] /= len(clusters[selected_cluster_i])

  return means, clusters

users = read_data("data.json")

# perform k means (it will select initial means and then
# classify all remaining non-means)
init_means, clusters = get_inital_means(users, 6)

# write resulting clusters to results.txt
result_file = open("results.txt", "w")
for cluster in clusters:
  result_file.write("-------------------------\n")
  result_file.write("Cluster: \n")
  for point in cluster:
    result_file.write("User: " + str(point["id"]) + "\t\t")
    for key in point["data"].keys():
      result_file.write(key + ": " + str(point["data"][key]))
      result_file.write("\t\t")
    result_file.write("\n\n")
result_file.close()
	import json

	# read data from the json file
	def read_data(filename):

	# open the file and read data
	with open(filename) as data_file:
	data = json.load(data_file)

	# for each item in the file add their id and data as a
	# point in the points list
	points = list()
	for item in data["dataset"]:
	point = {}
	point["id"] = item["id"]
	point["data"] = {}
	for dimention in item["data"]:
	point["data"][dimention] = 1.0
	points.append(point)

	return points

	def get_magnitude(point):
	magnitude = 0.0
	data = point["data"]
	for key in list(data.keys()):
	magnitude += data[key]
	return magnitude

	# return distance between point_a and point_b as a scalar (absolute value)
	def get_distance(point_a, point_b):
	a_data = point_a["data"]
	b_data = point_b["data"]

	result_point = {}
	result_point["id"] = "\|" + point_a["id"] + "-" + point_b["id"] + "\|"
	result_point["data"] = {}

	# for data in both a and b AND all data in a but not in b
	for a_key in list(a_data.keys()):
	if a_key not in list(b_data.keys()):
	result_point["data"][a_key] = 1.0
	else:
	result_point["data"][a_key] = abs(a_data[a_key] - b_data[a_key])
	# for data in b but not in a
	for b_key in list(b_data.keys()):
	if b_key not in list(a_data.keys()):
	result_point["data"][b_key] = 1.0

	return result_point

	# return a points that's the result of point_a + point_b
	def add_points(point_a, point_b):
	a_data = point_a["data"]
	b_data = point_b["data"]

	result_point = {}
	result_point["id"] = point_a["id"] + "+" + point_b["id"]
	result_point["data"] = {}
	# for data in both a and b AND all data in a but not in b
	for a_key in list(a_data.keys()):
	if a_key not in list(b_data.keys()):
	result_point["data"][a_key] = a_data[a_key]
	else:
	result_point["data"][a_key] = (a_data[a_key] + b_data[a_key])
	# for data in b but not in a
	for b_key in list(b_data.keys()):
	if b_key not in list(a_data.keys()):
	result_point["data"][b_key] = b_data[b_key]

	return result_point

	# get the initial means
	def get_inital_means(points, k):
	# assert that there are more points than value k
	assert (len(points) >= k)

	ref_point = points[0]
	distances = list()

	# get the largest distance from the ref_point to another point
	largest_distance = None
	for p_i in range(len(points)):
	distance = get_magnitude(get_distance(ref_point, points[p_i]))

	if(largest_distance == None or distance > largest_distance):
	largest_distance = distance

	means = list()
	non_means = points[:]
	for k_i in range(k):
	# get closest distance this interval of distance from the refrence point
	closest_distance = None
	p_i = 0
	while(p_i < len(non_means)):
	distance = get_magnitude(get_distance(ref_point, non_means[p_i]))
	difference = abs(k_i*largest_distance/k - distance)
	if closest_distance == None or difference < closest_distance["difference"]:
	closest_distance = {}
	closest_distance["difference"] = difference
	closest_distance["point_index"] = p_i
	p_i += 1

	# add the closest point to the means list
	means.append(non_means.pop(closest_distance["point_index"]))

	# initially clusters only have the initial means in each
	clusters = [list() for i in range(len(means))]
	for i in range(len(means)): clusters[i].append(means[i])

	# classify the rest of the data (the data that wasn't used as initial means)
	final_means, clusters = classify(means, clusters, non_means)

	return final_means, clusters

	# classify each of the data points in data_points
	# assumes there is an existing mean for each cluster
	def classify(means, clusters, data_points):
	final_means = list()
	final_clusters = list()

	for point in data_points:
	smallest_distance = None

	# get the mean this point is closest to
	for i in range(len(means)):
	this_dist = get_magnitude(get_distance(point, means[i]))
	if smallest_distance == None or this_dist < smallest_distance["distance"]:
	smallest_distance = {}
	smallest_distance["index"] = i
	smallest_distance["distance"] = this_dist

	# add the point to the cluster and store the previous cluster length
	selected_cluster_i = smallest_distance["index"]
	clusters[selected_cluster_i].append(point)
	cluster_len = len(clusters[selected_cluster_i]) - 1

	# recalculate this clusters meme (accounting for the new point just added)
	for key, prev_val in means[selected_cluster_i]["data"].items():
	means[selected_cluster_i]["data"][key] = prev_val*(cluster_len - 1)
	means[selected_cluster_i] = add_points(means[selected_cluster_i], point)
	for key, prev_val in means[selected_cluster_i]["data"].items():
	means[selected_cluster_i]["data"][key] /= len(clusters[selected_cluster_i])

	return means, clusters

	users = read_data("data.json")

	# perform k means (it will select initial means and then
	# classify all remaining non-means)
	init_means, clusters = get_inital_means(users, 6)

	# write resulting clusters to results.txt
	result_file = open("results.txt", "w")
	for cluster in clusters:
	result_file.write("-------------------------\n")
	result_file.write("Cluster: \n")
	for point in cluster:
	result_file.write("User: " + str(point["id"]) + "\t\t")
	for key in point["data"].keys():
	result_file.write(key + ": " + str(point["data"][key]))
	result_file.write("\t\t")
	result_file.write("\n\n")
	result_file.close()