Last active
October 24, 2023 00:57
-
-
Save onelharrison/373d81dc21d43c3126f15d2d0867d80a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter | |
import math | |
def knn(data, query, k, distance_fn, choice_fn):
    """Predict a label for ``query`` from its ``k`` nearest neighbors in ``data``.

    Each example in ``data`` is a sequence whose last element is the label and
    whose preceding elements are the features compared against ``query``.

    Args:
        data: Indexable collection of examples (features followed by a label).
        query: Feature values of the point to predict for.
        k: Number of neighbors to consult; must be at least 1. If ``k`` is
            larger than the number of examples, every example is used.
        distance_fn: Callable ``(features, query) -> distance``.
        choice_fn: Callable that reduces the list of neighbor labels to a
            prediction (e.g. a mean for regression, a mode for classification).

    Returns:
        A tuple ``(neighbors, prediction)``. ``neighbors`` is a list of
        ``(distance, index)`` pairs in ascending order (ties on distance are
        ordered by index); ``prediction`` is ``choice_fn`` applied to the
        labels of those neighbors.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    # A negative k would make the slice below drop neighbors from the *far*
    # end instead of selecting the nearest ones, so reject it explicitly.
    if k < 1:
        raise ValueError("k must be at least 1, got {!r}".format(k))

    # Distance from the query to every example, paired with the example index
    # so the label can be looked up after sorting.
    distances_and_indices = [
        (distance_fn(example[:-1], query), index)
        for index, example in enumerate(data)
    ]

    # Ascending sort on (distance, index), then keep the k closest entries.
    k_nearest_distances_and_indices = sorted(distances_and_indices)[:k]

    # The label is stored as the last element of each example.
    k_nearest_labels = [
        data[index][-1] for _, index in k_nearest_distances_and_indices
    ]

    return k_nearest_distances_and_indices, choice_fn(k_nearest_labels)
def mean(labels):
    """Return the arithmetic mean of ``labels`` (their sum divided by their count)."""
    total = sum(labels)
    count = len(labels)
    return total / count
def mode(labels):
    """Return the label that occurs most often in ``labels``."""
    tally = Counter(labels)
    # most_common(1) yields a one-element list of (label, count) pairs.
    top_label, _occurrences = tally.most_common(1)[0]
    return top_label
def euclidean_distance(point1, point2):
    """Return the Euclidean (straight-line) distance between two points.

    Args:
        point1: Sequence of numeric coordinates.
        point2: Sequence of numeric coordinates with the same length as
            ``point1``.

    Returns:
        The distance as a float; ``0.0`` for two empty points.

    Raises:
        ValueError: If the points do not have the same number of coordinates.
    """
    # Reject mismatched dimensions up front: comparing only a shared prefix
    # would silently produce a meaningless distance.
    if len(point1) != len(point2):
        raise ValueError(
            "points must have the same number of dimensions, got {} and {}".format(
                len(point1), len(point2)
            )
        )

    sum_squared_distance = 0
    for coord1, coord2 in zip(point1, point2):
        sum_squared_distance += math.pow(coord1 - coord2, 2)
    return math.sqrt(sum_squared_distance)
def main():
    """Run the regression and classification KNN demos and print the results."""
    # Regression data
    #
    # Column 0: height (inches)
    # Column 1: weight (pounds)
    reg_data = [
        [65.75, 112.99],
        [71.52, 136.49],
        [69.40, 153.03],
        [68.22, 142.34],
        [67.79, 144.30],
        [68.70, 123.30],
        [69.80, 141.49],
        [70.01, 136.46],
        [67.90, 112.37],
        [66.49, 127.45],
    ]

    # Question:
    # Given the data we have, what's the best-guess at someone's weight if
    # they are 60 inches tall?
    reg_query = [60]
    reg_k_nearest_neighbors, reg_prediction = knn(
        reg_data, reg_query, k=3, distance_fn=euclidean_distance, choice_fn=mean
    )
    # Report the result; without this the demo computes the answer and
    # discards it.
    print("Regression nearest (distance, index): {}".format(reg_k_nearest_neighbors))
    print("Regression prediction (weight in pounds): {}".format(reg_prediction))

    # Classification data
    #
    # Column 0: age
    # Column 1: likes pineapple
    clf_data = [
        [22, 1],
        [23, 1],
        [21, 1],
        [18, 1],
        [19, 1],
        [25, 0],
        [27, 0],
        [29, 0],
        [31, 0],
        [45, 0],
    ]

    # Question:
    # Given the data we have, does a 33 year old like pineapples on their pizza?
    clf_query = [33]
    clf_k_nearest_neighbors, clf_prediction = knn(
        clf_data, clf_query, k=3, distance_fn=euclidean_distance, choice_fn=mode
    )
    print("Classification nearest (distance, index): {}".format(clf_k_nearest_neighbors))
    print("Classification prediction (likes pineapple): {}".format(clf_prediction))
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@chegeeh and @Olatokunbo360 lol it's not printing 'cause you're not asking it to print anything :P
The first question is: "Given the data we have, what's the best-guess at someone's weight if they are 60 inches tall?"
To get the result, add this line right after the regression knn(...) call (around line 68 of the gist):
print(reg_prediction)
and you'll see that the result is about 128.25 pounds. ;)