# jiobu1/nn.py

## nn.py
# Nearest Neighbor

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Feature columns fed to the nearest-neighbor search, listed by theme.
# The order is kept as-is because it fixes the column order of X.
_feature_columns = [
    # population
    'TotalPop', 'Men', 'Women',
    'Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific',
    'Diversity Index',
    # income and poverty
    'Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr',
    'Poverty', 'ChildPoverty',
    # employment
    'Employed', 'Unemployment',
    'PrivateWork', 'PublicWork', 'SelfEmployed', 'FamilyWork',
    'Professional', 'Service', 'Office', 'Construction', 'Production',
    # commute
    'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp', 'WorkAtHome',
    'MeanCommute',
    # crime
    'Violent crime', 'Murder and nonnegligent manslaughter', 'Rape',
    'Robbery', 'Aggravated assault', 'Property crime', 'Burglary',
    'Larceny- theft', 'Motor vehicle theft', 'Arson',
    'Crime Rate per 1000',
    # rent
    'Rent',
    # air quality
    'Days with AQI', 'Good Days', 'Moderate Days',
    'Unhealthy for Sensitive Groups Days',
    'Unhealthy Days', 'Very Unhealthy Days', 'Hazardous Days',
    'Max AQI', '90th Percentile AQI', 'Median AQI',
    'Days CO', 'Days NO2', 'Days Ozone', 'Days SO2', 'Days PM2.5',
    'Days PM10',
]

# Sub-frame of `merged` restricted to the feature columns above.
X = merged[_feature_columns]

# Scale the data because the columns are not on comparable scales, e.g. rent
# and population are in the thousands while other columns are percentages.
scaler = StandardScaler()

# fit_transform returns a bare NumPy array, so wrap it back into a DataFrame
# to keep the column names attached to the scaled values. The new frame gets a
# fresh 0..n-1 row index, so its rows are addressed by position.
standard_df = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Build the nearest-neighbor model on the scaled features. Six neighbors are
# requested per query; fit() hands back the fitted estimator, so construction
# and fitting are chained into a single assignment.
nn = NearestNeighbors(algorithm='kd_tree', n_neighbors=6, n_jobs=8).fit(standard_df)

# function to join list of nearest neighbors by id
def nearest(idx):
    """Return the nearest neighbors of one row as a comma-separated string.

    Args:
        idx: Integer position of the row in ``standard_df`` to look up.

    Returns:
        The row positions of the neighbors joined by commas, in the order
        ``nn.kneighbors`` reports them, with the first hit dropped.
    """
    # Select with a list so the query stays a one-row DataFrame. Wrapping a
    # Series in a plain list strips the column names the model was fitted
    # with, which makes scikit-learn warn about feature names on every call.
    query = standard_df.iloc[[idx]]
    neighbor_idx = nn.kneighbors(query, return_distance=False)

    # The first hit is skipped: a row queried against the data it was fitted
    # on is normally its own closest match.
    return ','.join(map(str, neighbor_idx[0][1:].tolist()))

# Keep each row's index label as an explicit column.
merged['Index'] = merged.index

# Look up the neighbors of every row with one batched query instead of one
# kneighbors call per row. Rows of standard_df are in the same order as rows of
# merged, so the results are assigned by position; this stays correct even
# when merged.index is not a plain 0..n-1 range.
_neighbor_idx = nn.kneighbors(standard_df, return_distance=False)

# Drop the first hit of each row (normally the row itself) and store the
# remaining neighbor positions as a comma-separated string.
merged['Nearest'] = [
    ','.join(map(str, row[1:].tolist())) for row in _neighbor_idx
]
	# Nearest Neighbor
# NOTE(review): this section re-runs the same nearest-neighbor pipeline that
# already appears earlier in this file. It looks like a paste artifact, so
# consider deleting one of the two copies. It has been re-indented to column 0
# so that the file parses.

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Select columns that will be used to calculate nearest neighbors
X = merged[[
    'TotalPop', 'Men', 'Women', 'Hispanic', 'White',
    'Black', 'Native', 'Asian', 'Pacific', 'Diversity Index',
    'Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr',
    'Poverty', 'ChildPoverty',
    'Employed', 'Unemployment',
    'PrivateWork', 'PublicWork', 'SelfEmployed', 'FamilyWork',
    'Professional', 'Service', 'Office', 'Construction', 'Production',
    'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp', 'WorkAtHome', 'MeanCommute',
    'Violent crime', 'Murder and nonnegligent manslaughter', 'Rape',
    'Robbery', 'Aggravated assault', 'Property crime', 'Burglary',
    'Larceny- theft', 'Motor vehicle theft', 'Arson', 'Crime Rate per 1000',
    'Rent',
    'Days with AQI', 'Good Days', 'Moderate Days',
    'Unhealthy for Sensitive Groups Days',
    'Unhealthy Days', 'Very Unhealthy Days', 'Hazardous Days',
    'Max AQI', '90th Percentile AQI', 'Median AQI', 'Days CO',
    'Days NO2', 'Days Ozone', 'Days SO2', 'Days PM2.5', 'Days PM10',
]]

# Scale the data because the columns are not on comparable scales, e.g. rent
# and population are in the thousands while other columns are percentages.
scaler = StandardScaler()
# Wrap the scaled array back into a DataFrame so the column names are kept.
standard_df = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# instantiate nearest neighbors algorithm and fit on scaled df
nn = NearestNeighbors(n_neighbors=6, algorithm='kd_tree', n_jobs=8)
nn.fit(standard_df)


# function to join list of nearest neighbors by id
def nearest(idx):
    """Return the nearest neighbors of one row as a comma-separated string.

    Args:
        idx: Integer position of the row in ``standard_df`` to look up.

    Returns:
        The row positions of the neighbors joined by commas, in the order
        ``nn.kneighbors`` reports them, with the first hit dropped.
    """
    # Select with a list so the query stays a one-row DataFrame and keeps the
    # column names the model was fitted with.
    query = standard_df.iloc[[idx]]
    neighbor_idx = nn.kneighbors(query, return_distance=False)
    # The first hit is skipped: a row queried against the data it was fitted
    # on is normally its own closest match.
    return ','.join(map(str, neighbor_idx[0][1:].tolist()))


# Keep each row's index label as an explicit column.
merged['Index'] = merged.index

# Look up the neighbors of every row with one batched query. Rows of
# standard_df are in the same order as rows of merged, so the results are
# assigned by position.
_neighbor_idx = nn.kneighbors(standard_df, return_distance=False)
merged['Nearest'] = [
    ','.join(map(str, row[1:].tolist())) for row in _neighbor_idx
]