#LA restaurant inspection analysis, EDA, and mapping for visualization
#TSB
#Hari Santanam, 2018
#import tools required to run the code
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import requests
import logging
import time
import seaborn as sns
#nRowsRead = 100  #uncomment this and pass nrows=nRowsRead to read_csv below if you only want to test the waters and not import the whole file
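#e.g., a sketch of the sampled read (not part of the original run):
#df1 = pd.read_csv('../input/restaurant-and-market-health-inspections.csv', nrows=nRowsRead)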
#read in file with all inspections and put in dataframe
df1 = pd.read_csv('../input/restaurant-and-market-health-inspections.csv', delimiter=',')
nRow, nCol = df1.shape
#print number of rows and columns
print(f'{nRow} rows, {nCol} columns')
#read in file with all health VIOLATIONS and put in dataframe
df2 = pd.read_csv('../input/restaurant-and-market-health-violations.csv')
from importlib import reload #reload lives in importlib, not matplotlib
reload(plt)
%matplotlib notebook
df1['score'] = df1['score'].astype(int) #make score of type INTEGER
score_dist_hist = df1['score'].plot(kind="hist", title="Score Distribution", figsize=(4,3), alpha=0.5)
score_dist_hist.set_xlabel("score - values")
df2['score'].hist() #quick histogram of the scores in the violations file
#print the top 10 rows of the violations dataframe (df2)
df2.head(10)
#group by restaurant type and count the number in each category
#then sort by count and create a bar graph
reload(plt)
%matplotlib notebook
temp = df1.groupby('pe_description').size()
description_distribution = pd.DataFrame({'Count':temp.values}, index=temp.index)
description_distribution = description_distribution.sort_values(by=['Count'], ascending=True)
description_distribution.plot(kind='barh', legend=False, figsize=(8, 10)) #bar graph of counts per restaurant type (hist() would fail on a string column)
#the previous charts and graphs show the breakdown of the various types of food restaurants,
#with risk broken down into high, moderate, and low.
#This function uses split() to pull the second-to-last space-separated token out of the
#pe_description field - ex: for x = "Aa Bb Cc", x.split(' ')[-2] returns 'Bb'
def sub_risk(x):
    return x.split(' ')[-2]
df2['risk'] = df2['pe_description'].astype(str).apply(sub_risk) #apply function to get only HIGH, MODERATE, LOW
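#quick sanity check of the split logic on a made-up value (the exact
#pe_description wording here is an assumption, not taken from the dataset):
assert sub_risk('RESTAURANT (0-30) SEATS HIGH RISK') == 'HIGH'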
temp = df2.groupby('risk').size() #group, count by risk level
#plot the counts for the 3 levels of risk
reload(plt)
%matplotlib notebook
df2['risk'].value_counts().plot(kind='bar') #bar chart of the 3 risk levels (hist() needs numeric data)
#calculate and plot pie chart for risk
risk_distribution = pd.DataFrame({'Count':temp.values}, index=temp.index)
risk_distribution
ax2 = risk_distribution['Count'].plot(kind="pie", legend=True, autopct='%.2f', figsize=(6, 6))
#show first 10 rows of the violations file dataframe
df2.head(10)
#group by violation_description, count, and sort from the most common violation to the least
violation_description = df2.groupby('violation_description').size()
pd.DataFrame({'Count':violation_description.values}, index=violation_description.index).sort_values(by='Count', ascending=False)
df2['pe_description'].value_counts()
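#note: value_counts(), as used on the previous line, is the one-step equivalent of the
#groupby/size/sort_values pattern above - it returns counts already sorted descending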
#create a simple function to map risk - low, moderate, high - to a numeric value for the heat map
def convert_risk_value(x):
    if x == 'LOW':
        return 10
    elif x == 'MODERATE':
        return 5
    else:
        return 0
#create a simple function to map grade to a numeric value
def convert_grade_value(x):
    if x == 'A':
        return 10
    elif x == 'B':
        return 5
    else:
        return 0
#call (apply) the functions created above
df2['risk_value'] = df2['risk'].apply(convert_risk_value)
df2['grade_value'] = df2['grade'].apply(convert_grade_value)
df3 = df2.loc[:, ['score', 'grade_value', 'risk_value']]
corr = df3.corr()
reload(plt)
%matplotlib notebook
sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values, cmap="Purples", center=0)
#top 20 facilities with most restaurants / markets
facility_dist = df1.groupby(['facility_id', 'facility_name']).size()
top20_facilities = facility_dist.sort_values(ascending=False).head(20)
pd.DataFrame({'Count':top20_facilities.values}, index=top20_facilities.index)
#top 30 owners with most restaurants / markets
owner_dist = df1.groupby(['owner_id', 'owner_name']).size()
top30_owners = owner_dist.sort_values(ascending=False).head(30)
pd.DataFrame({'Count':top30_owners.values}, index=top30_owners.index)
#violations listing from most common, descending - violation description, violation code, counts
violation_desc = df2.groupby(['violation_description','violation_code']).size()
pd.DataFrame({'Count':violation_desc.values}, index=violation_desc.index).sort_values(by='Count', ascending=False)
#list facilities with the most violations and the type of violation
#create a dataframe with facility and violation columns, aggregate by size, then count and sort them
violation_desc2 = df2.groupby(['facility_name','violation_description']).size()
pd.DataFrame({'Count':violation_desc2.values}, index=violation_desc2.index).sort_values(by='Count', ascending=False)
df1.head(10)
#get a list of all the restaurants with grade C
df4 = df2.loc[(df2['grade'] == 'C'), ['facility_name','facility_address','facility_zip']]
df4 = df4.drop_duplicates(['facility_name']) #only want each restaurant listed once, since many of them have multiple violations
df4
#visualize bad restaurants (grade C) on a map, so that if you are in that area, you can avoid them :)
#some of them might have remediated their violations, or may be operating under "new management", or maybe even turned over a new leaf - we just don't know
addresses_to_avoid = (df4['facility_name'] + ',' + df4['facility_address'] + ',' + 'LOS ANGELES' + ',CA').tolist()
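#each list entry now has the form 'NAME,STREET ADDRESS,LOS ANGELES,CA' - for example
#'HABITAT COFFEE SHOP,3708 N EAGLE ROCK BLVD,LOS ANGELES,CA' - which is the address
#format the geocoding calls below expect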
#filter by a specific violation - in this case I picked "Impoundment of unsanitary equipment or food" - and list the violators
df2.loc[(df2['violation_description'] == "# 50. Impoundment of unsanitary equipment or food") & (df2['violation_status']=="OUT OF COMPLIANCE"), ['facility_name','facility_address', 'violation_description','activity_date']]
addresses_for_violation_50 = df2.loc[(df2['violation_description'] == "# 50. Impoundment of unsanitary equipment or food") & (df2['violation_status']=="OUT OF COMPLIANCE"), ['facility_name','facility_address']]
print(addresses_for_violation_50)
#this cell creates and prints a breakdown list of violators for each violation code
#first, get the unique list of violations with .unique()
#then loop over the unique violation descriptions and,
#for each one, print the list of offenders
temp1 = df2.violation_description.unique()
print(len(temp1)) #print the total number of UNIQUE violations in the dataset
for desc in temp1: #for each one, get the list of restaurants that match it in the dataset
    temp2 = df2.loc[(df2['violation_description'] == desc) & (df2['violation_status']=="OUT OF COMPLIANCE"), ['violation_code','facility_name','facility_address']]
    print(temp2)
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    filename = df.dataframeName
    df = df.dropna(axis='columns') # drop columns with NaN
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns with more than 1 unique value
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum=1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    df = df.select_dtypes(include=[np.number]) # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    df = df.dropna(axis='columns')
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns with more than 1 unique value
    columnNames = list(df)
    if len(columnNames) > 10: # reduce the number of columns for matrix inversion of kernel density plots
        columnNames = columnNames[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    for i, j in zip(*np.triu_indices_from(ax, k=1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()
df1.dataframeName = 'restaurant-and-market-health-inspections.csv'
df2.dataframeName = 'restaurant-and-market-health-violations.csv'
plotCorrelationMatrix(df2, 8)
plotScatterMatrix(df1, 9, 10)
address_column_name = 'facility_address'
restaurant_name = 'facility_name'
RETURN_FULL_RESULTS = False
BACKOFF_TIME = 30
logger = logging.getLogger("root")
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler() #console handler
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)
API_KEY = 'your API key here'
output_filename = '../output/output-2018.csv'
#print(addresses)
#adapted from Shane Lynn - thanks
def get_google_results(address, api_key=None, return_full_response=False):
    geocode_url = "https://maps.googleapis.com/maps/api/geocode/json?address={}".format(address)
    if api_key is not None:
        geocode_url = geocode_url + "&key={}".format(api_key)
    #ping google for the results:
    results = requests.get(geocode_url)
    results = results.json()
    if len(results['results']) == 0:
        output = {
            "formatted_address": None,
            "latitude": None,
            "longitude": None,
            "accuracy": None,
            "google_place_id": None,
            "type": None,
            "postcode": None
        }
    else:
        answer = results['results'][0]
        output = {
            "formatted_address": answer.get('formatted_address'),
            "latitude": answer.get('geometry').get('location').get('lat'),
            "longitude": answer.get('geometry').get('location').get('lng'),
            "accuracy": answer.get('geometry').get('location_type'),
            "google_place_id": answer.get("place_id"),
            "type": ",".join(answer.get('types')),
            "postcode": ",".join([x['long_name'] for x in answer.get('address_components')
                                  if 'postal_code' in x.get('types')])
        }
    #append some other details
    output['input_string'] = address
    output['number_of_results'] = len(results['results'])
    output['status'] = results.get('status')
    if return_full_response:
        output['response'] = results
    return output
#test that the API key is valid, internet access works, and the function returns a good result
test_result = get_google_results("HABITAT COFFEE SHOP, 3708 N EAGLE ROCK BLVD, LOS ANGELES, CA", API_KEY, RETURN_FULL_RESULTS)
print(test_result)
results2 = []
for address in addresses_to_avoid:
    geocode_result = get_google_results(address, API_KEY, return_full_response=RETURN_FULL_RESULTS)
    results2.append(geocode_result)
pd.DataFrame(results2).to_csv('../output/restaurants_to_avoid.csv', encoding='utf8')
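#BACKOFF_TIME and the logger are set up above but never used in the simple loop;
#below is a minimal sketch (not part of the original analysis) of a rate-limit-aware
#variant, assuming the Geocoding API signals throttling with status 'OVER_QUERY_LIMIT'
def geocode_with_backoff(addresses, api_key=API_KEY):
    #retry each address until the API stops reporting a query-limit status
    results = []
    for address in addresses:
        while True:
            result = get_google_results(address, api_key, return_full_response=RETURN_FULL_RESULTS)
            if result['status'] == 'OVER_QUERY_LIMIT':
                logger.info('Hit query limit - backing off for %s seconds', BACKOFF_TIME)
                time.sleep(BACKOFF_TIME)
            else:
                break
        results.append(result)
    return results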
#mapbox map of the addresses to avoid
import plotly.graph_objs as go
import plotly.offline as offline
import pandas as pd
#This is the mapbox API token (different from the google token used in the main restaurant
#code, which provided the lat/long coordinates for each specified address);
#the website uses it to validate user access and map the latitude/longitude pairs
mapbox_token = "Insert your mapbox token here"
#modify the path with your path as needed
#df = pd.read_csv('../output-2018.csv')
df = pd.read_csv('../output/restaurants_to_avoid.csv')
df = df.drop(['formatted_address'], axis=1) #drop() returns a new frame, so assign it back
#data parameters from the file to map
trace = go.Scattermapbox(lat=df["latitude"], lon=df["longitude"], text=df["input_string"], marker=dict(size=10), mode="markers+text", textposition="top center")
#34.0292265, -118.4639102 - approximate coordinates the map is centered on
data = [trace]
#define map layout - access token (API), and the location the map should be centered on
layout = go.Layout(mapbox=dict(accesstoken=mapbox_token, center=dict(lat=34.03, lon=-118.46), zoom=11.0))
#finalize data and layout
figure = go.Figure(data=data, layout=layout)
#draw the map already! NOTE: the map will open in a new tab in your default browser
offline.plot(figure)
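#optional (an assumption, not in the original): offline.plot also accepts a filename
#argument if you want the map written to a specific HTML file, e.g.
#offline.plot(figure, filename='../output/restaurants_to_avoid_map.html')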