Created
June 1, 2019 21:44
-
-
Save schwartzadev/da2c8e655f383da63f06e81fdcc49851 to your computer and use it in GitHub Desktop.
A KNN algorithm that recommends schools similar to a given school entered by the user
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import numpy as np | |
from sklearn.neighbors import NearestNeighbors | |
import pandas as pd | |
""" | |
schoolInfo.json is from https://www.kaggle.com/theriley106/university-statistics | |
""" | |
def get_numpy_array(path='schoolInfo.json'):
    """Load the school records from a JSON file into a NumPy array.

    Args:
        path: Location of the JSON file. Defaults to ``'schoolInfo.json'`` in
            the current working directory, which is what this function read
            before the parameter existed.

    Returns:
        ``np.array`` built from the parsed JSON document.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default
    # encoding when reading it.
    with open(path, encoding='utf-8') as f:
        json_data = json.load(f)
    return np.array(json_data)
# Read and parse the school records once, at import time; the module-level
# code further down turns ``json_data`` into the feature matrix.
with open('schoolInfo.json') as f:
    content = f.read()

json_data = json.loads(content)
def get_dataframe_from_json(json):
    """Turn raw school records into a numeric feature matrix and a name vector.

    Args:
        json: Parsed JSON records (for example a list of dicts), one per
            school. Note that this parameter name shadows the ``json`` module
            inside the function body.

    Returns:
        A tuple ``(X, y)``: ``X`` is a DataFrame holding the nine feature
        columns cast to ``float64``; ``y`` is a Series with the matching
        ``displayName`` values. Rows with a null in any selected column are
        dropped, and both share a fresh 0-based index.

    Raises:
        KeyError: If any of the selected columns is missing from the records.
    """
    # ``pd.io.json.json_normalize`` has been deprecated since pandas 1.0 in
    # favour of the top-level ``pd.json_normalize``; only fall back to the
    # legacy location on installations that predate the new name.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        from pandas.io.json import json_normalize as normalize

    df = normalize(json)
    df = df[[
        'displayName',
        'acceptance-rate',
        'act-avg',
        'cost-after-aid',
        'enrollment',
        'hs-gpa-avg',
        'isPublic',
        'rankingDisplayScore',
        'sat-avg',
        'tuition',
    ]]
    df = df.dropna()  # drop rows with nulls
    df = df.reset_index(drop=True)  # renumber the remaining rows 0, 1, 2, ...

    y = df['displayName']  # results vector
    X = df.drop('displayName', axis=1)  # everything except the results
    X = X.astype(np.float64)  # make everything floats (booleans become 0.0 / 1.0)
    return X, y
# Module-level feature matrix (X) and school-name vector (y); the functions
# defined below read these globals.
X, y = get_dataframe_from_json(json=json_data)
def get_n_similar_schools(n, school_id):
    """Print, and return, the ``n`` schools whose features are closest to one school.

    Reads the module-level feature matrix ``X`` and name vector ``y``.

    Args:
        n: How many similar schools to report.
        school_id: Row of the query school in ``X`` / ``y``. It is used both
            positionally (``X.iloc``) and as a label (``y[...]``), which agree
            as long as both carry a 0-based default index.

    Returns:
        List of the reported school names, nearest first. Fewer than ``n``
        names are returned when ``X`` has fewer than ``n + 1`` rows.
    """
    # One extra neighbour is requested because the query school is part of
    # the fitted data; never request more neighbours than there are rows.
    k = min(n + 1, len(X))
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(X)

    # Query with a one-row DataFrame so the column names match the fitted data.
    _, indices = nbrs.kneighbors(X.iloc[[school_id]])

    # Drop the query school by index rather than by position: a school with
    # identical features may be ranked ahead of it.
    nearest = [i for i in indices[0] if i != school_id][:n]
    names = [y.iloc[i] for i in nearest]

    print('schools similar to {}...'.format(y[school_id]))
    for name in names:
        print('\t', name)
    return names
def get_school_id(results_vector):
    """Prompt for a school name until it matches exactly one entry.

    The typed text is compared case-insensitively as a plain substring (not a
    regular expression) against every value in ``results_vector``.

    Args:
        results_vector: pandas Series of school names.

    Returns:
        The index label of the single matching entry.
    """
    # Lower-casing the names does not depend on the input, so do it once.
    lowered = results_vector.str.lower()

    while True:
        school_name = input('enter a school name... ').strip()
        # regex=False: characters such as "(" or "." in user input must be
        # matched literally instead of being parsed as a pattern.
        # na=False: missing names never match.
        mask = lowered.str.contains(school_name.lower(), regex=False, na=False)
        results = results_vector[mask]

        if len(results) == 1:
            return results.index[0]
        if len(results) == 0:
            print('"{}" returned no results. Try again.'.format(school_name))
        else:
            print('"{}" returned multiple results (below). Try again.'.format(school_name))
            print(results)
# Script entry: ask the user for a school, then print its five nearest
# neighbours.
school_id = get_school_id(results_vector=y)
get_n_similar_schools(n=5, school_id=school_id)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment