Skip to content

Instantly share code, notes, and snippets.

@schwartzadev
Created June 1, 2019 21:44
Show Gist options
  • Save schwartzadev/da2c8e655f383da63f06e81fdcc49851 to your computer and use it in GitHub Desktop.
Save schwartzadev/da2c8e655f383da63f06e81fdcc49851 to your computer and use it in GitHub Desktop.
A KNN algorithm that recommends schools similar to a given school, entered by the user
import json
import numpy as np
from sklearn.neighbors import NearestNeighbors
import pandas as pd
"""
schoolInfo.json is from https://www.kaggle.com/theriley106/university-statistics
"""
def get_numpy_array(path='schoolInfo.json'):
    """Load the school records from a JSON file into a NumPy array.

    Args:
        path: Location of the JSON file. Defaults to ``schoolInfo.json``
            in the current working directory, which is what the original
            zero-argument call read.

    Returns:
        The result of ``np.array`` applied to the decoded JSON document.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, encoding='utf-8') as f:
        return np.array(json.load(f))
# Read the raw school records once at import time. The encoding is stated
# explicitly because JSON text is UTF-8 and the platform default may differ.
with open('schoolInfo.json', encoding='utf-8') as f:
    content = f.read()
json_data = json.loads(content)
def get_dataframe_from_json(json):
    """Build the feature matrix and the school-name vector from raw records.

    Args:
        json: Decoded JSON records to flatten. Inside this function the
            name shadows the ``json`` module; it is kept unchanged so that
            existing callers continue to work.

    Returns:
        A ``(X, y)`` tuple. ``X`` is a DataFrame holding the selected
        feature columns cast to ``float64``; ``y`` is the ``displayName``
        Series. Rows containing any null in the selected columns are
        dropped, and both share a fresh 0-based index.
    """
    # pandas 1.0 moved json_normalize to the top-level namespace and
    # deprecated the pandas.io.json spelling; fall back to the old location
    # only for installs that predate the move.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        normalize = pd.io.json.json_normalize

    columns = [
        'displayName',
        'acceptance-rate',
        'act-avg',
        'cost-after-aid',
        'enrollment',
        'hs-gpa-avg',
        'isPublic',
        'rankingDisplayScore',
        'sat-avg',
        'tuition',
    ]
    df = normalize(json)[columns]
    # Drop incomplete rows, then renumber so positions and labels agree.
    df = df.dropna().reset_index(drop=True)

    y = df['displayName']  # results vector
    X = df.drop('displayName', axis=1).astype(np.float64)  # features only
    return X, y
# Module-level feature matrix (X) and school-name vector (y);
# get_n_similar_schools reads both as globals.
X, y = get_dataframe_from_json(json_data)
def get_n_similar_schools(n, school_id):
    """Print the ``n`` schools whose features are closest to one school.

    Reads the module-level ``X`` (features) and ``y`` (names). Nothing is
    returned; the result is written to standard output.

    Args:
        n: Number of similar schools to print. Fewer are printed when the
            data set does not contain that many other schools.
        school_id: Row position in ``X`` / label in ``y`` of the query
            school.
    """
    # NOTE(review): features are used unscaled, so columns with large
    # magnitudes presumably dominate the distance; consider standardising.

    # Ask for one extra neighbour because the query school is part of the
    # fitted data, but never for more neighbours than there are samples.
    n_neighbors = min(n + 1, len(X))
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(X)

    # Query with a one-row DataFrame so the feature names seen at fit time
    # are preserved.
    _, neighbor_rows = nbrs.kneighbors(X.iloc[[school_id]])

    # Exclude the query school by identity rather than by assuming it is the
    # first hit: schools with identical features tie at distance 0 and may
    # be returned in any order.
    similar = [s for s in neighbor_rows[0] if s != school_id][:n]

    print('schools similar to {}...'.format(y[school_id]))
    for s in similar:
        print('\t', y[s])
def get_school_id(results_vector):
    """Prompt until the user's input identifies exactly one school.

    Matching is a case-insensitive, literal substring search over
    ``results_vector``. When several names contain the input but exactly one
    of them equals it (ignoring case), that one is chosen, so a school whose
    name is a prefix of another school's name can still be selected.

    Args:
        results_vector: pandas Series of school names.

    Returns:
        The index label of the matching entry in ``results_vector``.
    """
    lowered = results_vector.str.lower()
    while True:
        school_name = input('enter a school name... ').strip()
        query = school_name.lower()

        # regex=False: the input is user text, not a pattern. Treating it as
        # a regex makes "(" raise and "." match every school.
        results = results_vector[lowered.str.contains(query, regex=False)]

        if len(results) > 1:
            exact = results[results.str.lower() == query]
            if len(exact) == 1:
                results = exact

        if len(results) == 1:
            return results.index[0]
        if len(results) == 0:
            print('"{}" returned no results. Try again.'.format(school_name))
        else:
            print('"{}" returned multiple results (below). Try again.'.format(school_name))
            print(results)
# Prompt and print recommendations only when executed as a script, so that
# importing this module does not block waiting for keyboard input.
if __name__ == '__main__':
    school_id = get_school_id(y)
    get_n_similar_schools(5, school_id)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment