Skip to content

Instantly share code, notes, and snippets.

@jiobu1
Last active April 26, 2021 00:43
Show Gist options
  • Save jiobu1/719a1016f6e6c6fcd994d04115dd80be to your computer and use it in GitHub Desktop.
Save jiobu1/719a1016f6e6c6fcd994d04115dd80be to your computer and use it in GitHub Desktop.
Nearest Neighbor algorithm
# Nearest Neighbor
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Columns of `merged` that feed the nearest-neighbor search, grouped by theme
feature_columns = [
    # population and demographics
    'TotalPop', 'Men', 'Women', 'Hispanic', 'White',
    'Black', 'Native', 'Asian', 'Pacific', 'Diversity Index',
    # income and poverty
    'Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr',
    'Poverty', 'ChildPoverty',
    # employment and type of work
    'Employed', 'Unemployment',
    'PrivateWork', 'PublicWork', 'SelfEmployed', 'FamilyWork',
    'Professional', 'Service', 'Office', 'Construction', 'Production',
    # commuting
    'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp', 'WorkAtHome',
    'MeanCommute',
    # crime
    'Violent crime', 'Murder and nonnegligent manslaughter', 'Rape',
    'Robbery', 'Aggravated assault', 'Property crime', 'Burglary',
    'Larceny- theft', 'Motor vehicle theft', 'Arson', 'Crime Rate per 1000',
    # housing cost
    'Rent',
    # air quality
    'Days with AQI', 'Good Days', 'Moderate Days',
    'Unhealthy for Sensitive Groups Days',
    'Unhealthy Days', 'Very Unhealthy Days', 'Hazardous Days',
    'Max AQI', '90th Percentile AQI', 'Median AQI', 'Days CO',
    'Days NO2', 'Days Ozone', 'Days SO2', 'Days PM2.5', 'Days PM10',
]
# Passing a list of labels selects those columns, in this order, as a DataFrame
X = merged[feature_columns]
# The features are on very different scales (rent and population run into the
# thousands, other columns are percentages), so standardize every column
# before measuring distances.
scaler = StandardScaler().fit(X)
scaled_values = scaler.transform(X)
# Wrap the scaled array back into a DataFrame that keeps X's column names
standard_df = pd.DataFrame(scaled_values, columns=X.columns)
# Build the KD-tree neighbor index on the scaled features. Six neighbors are
# requested per query; fit() returns the estimator itself, so it can be
# chained onto the constructor.
nn = NearestNeighbors(
    n_neighbors=6,
    algorithm='kd_tree',
    n_jobs=8,
).fit(standard_df)
# function to join list of nearest neighbors by id
def nearest(idx):
    """Return the nearest neighbors of one row as a comma-separated string.

    Args:
        idx: Integer position of the query row in ``standard_df``.

    Returns:
        The positions (in ``standard_df``) of the closest rows, excluding the
        query row itself, joined with commas. The string holds one fewer
        entry than the number of neighbors ``nn`` returns per query.
    """
    # Resolve idx to a non-negative position (negative values count from the
    # end, as with iloc) so it can be compared with the returned positions.
    position = range(len(standard_df))[idx]
    # iloc[[...]] keeps a one-row DataFrame, so the query carries the same
    # column names that nn was fitted with.
    query = standard_df.iloc[[position]]
    found = nn.kneighbors(query, return_distance=False)[0].tolist()
    # The query row is normally the first hit, but a duplicate row ties at
    # distance zero and may be listed ahead of it. Drop the query row by value
    # rather than by position; the slice keeps the count fixed if it is absent.
    others = [i for i in found if i != position][:len(found) - 1]
    return ','.join(map(str, others))
# Copy the row labels of `merged` into an 'Index' column, then run nearest()
# on each of those values and store the resulting strings in 'Nearest'.
row_labels = merged.index
merged['Index'] = row_labels
merged['Nearest'] = merged['Index'].map(nearest)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment