Skip to content

Instantly share code, notes, and snippets.

@HariSan1
Last active September 7, 2018 22:59
Show Gist options
  • Save HariSan1/01971fb041f489f1f2ccfc99c3c32a06 to your computer and use it in GitHub Desktop.
Save HariSan1/01971fb041f489f1f2ccfc99c3c32a06 to your computer and use it in GitHub Desktop.
LA restaurant inspection analysis, EDA, and mapping for visualization
#TSB
#Hari Santanam, 2018
#import tools required to run the code
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import requests
import logging
import time
import seaborn as sns
#nRowsRead = 100 - use this if you only want to test the waters and not import the whole file, remember to uncomment until the hyphen :)
#read in file with all inspections and put in dataframe
df1 = pd.read_csv('../input/restaurant-and-market-health-inspections.csv', delimiter=',')
nRow, nCol = df1.shape
#print number of rows and columns
print({nRow}, {nCol})
#read in file with all health VIOLATIONS and put in dataframe
df2 = pd.read_csv('../input/restaurant-and-market-health-violations.csv')
from matplotlib import reload
reload(plt)
%matplotlib notebook
df1['score']=df1['score'].astype(int) #make score of type INTEGER
#histogram of inspection scores from the inspections dataframe (df1)
score_dist_hist = df1['score'].plot(kind = "hist", title = "Score Distribution", figsize = (4,3), alpha = 0.5)
score_dist_hist.set_xlabel("score - values")
#histogram of scores from the violations dataframe (df2)
df2['score'].hist().plot()
#print the top 10 rows of the violations dataframe (df2)
df2.head(10)
#group by restaurant type and count # in each category
#then sort from highest # to lowest, then create a bar graph
reload(plt)
%matplotlib notebook
#count inspections per facility-type description, then sort ascending by count
temp = df1.groupby('pe_description').size()
description_distribution = pd.DataFrame({'Count':temp.values}, index=temp.index)
description_distribution = description_distribution.sort_values(by=['Count'], ascending=True)
df2['pe_description'].hist().plot()
#the previous charts and graphs show breakdown of various types food restaurants with risk
#broken down to high, medium, low.
#This procedure uses the split function to break the pe_description field into the token
#that sits second from the end - ex: for x = "Aa Bb Cc", x.split(' ')[-2] returns 'Bb'
def sub_risk(x):
    """Return the second-to-last space-separated token of x (e.g. 'Aa Bb Cc' -> 'Bb')."""
    tokens = x.split(' ')
    return tokens[-2]
#derive a 'risk' column by extracting the risk word from pe_description
df2['risk'] = df2['pe_description'].astype(str).apply(sub_risk) #apply function to get only high, medium, low
temp = df2.groupby('risk').size() #group, count by risk level
#plot the histogram for the 3 levels of risk
reload(plt)
%matplotlib notebook
df2['risk'].hist().plot()
#calculate and plot pie chart for risk
risk_distribution = pd.DataFrame({'Count':temp.values}, index = temp.index)
risk_distribution
ax2 = risk_distribution['Count'].plot(kind="pie", legend=True,autopct='%.2f', figsize=(6, 6))
#show first 10 rows of the violations file dataframe
df2.head(10)
#group by violation_description, count and sort them from largest violation by count to smallest
violation_description = df2.groupby('violation_description').size()
pd.DataFrame({'Count':violation_description.values},index = violation_description.index).sort_values(by = 'Count',ascending=False)
#raw frequency of each facility-type description
df2['pe_description'].value_counts()
#create a simple proc for heat map for risk - low, moderate, high
def convert_risk_value(x):
    """Map a risk label to a numeric weight: LOW -> 10, MODERATE -> 5, anything else -> 0."""
    return {'LOW': 10, 'MODERATE': 5}.get(x, 0)
#create simple proc to map grade to value
def convert_grade_value(x):
    """Map a letter grade to a numeric weight: A -> 10, B -> 5, anything else -> 0."""
    return {'A': 10, 'B': 5}.get(x, 0)
#call (apply) procs created above
df2['risk_value']=df2['risk'].apply(convert_risk_value)
df2['grade_value']=df2['grade'].apply(convert_grade_value)
df3 = df2.loc[:,['score', 'grade_value', 'risk_value']]
corr = df3.corr()
corr = (corr)
reload(plt)
%matplotlib notebook
sns.heatmap(corr, xticklabels = corr.columns.values, yticklabels=corr.columns.values, cmap="Purples", center=0)
#top 20 facilities with most restaurants / markets
facility_dist = df1.groupby(['facility_id', 'facility_name']).size()
top20_facilities = facility_dist.sort_values(ascending=False).head(20)
pd.DataFrame({'Count':top20_facilities.values}, index=top20_facilities.index)
#top 30 owners with most restaurants / markets
owner_dist = df1.groupby(['owner_id', 'owner_name']).size()
top30_owners = owner_dist.sort_values(ascending=False).head(30)
pd.DataFrame({'Count':top30_owners.values}, index= top30_owners.index)
#violations listing from most common, descending - violation description, violation code, counts
violation_desc=df2.groupby(['violation_description','violation_code']).size()
pd.DataFrame({'Count':violation_desc.values}, index=violation_desc.index).sort_values(by = 'Count', ascending=False)
#list facilities with most violations and type of violation
#create a dataframe with facility and violation columns, aggregate by size, then count and sort them
violation_desc2 = df2.groupby(['facility_name','violation_description']).size()
pd.DataFrame({'Count':violation_desc2.values}, index=violation_desc2.index).sort_values(by='Count', ascending=False)
#peek at the inspections dataframe
df1.head(10)
#get a list of all the restaurants with grade C
df4 = df2.loc[(df2['grade'] == 'C'),['facility_name','facility_address','facility_zip']]
df4=df4.drop_duplicates(['facility_name']) #only want each restaurant listed once, since many of them have multiple violations
df4
#visualize bad restaurants (grade C)on a map, so that if you are in that area, you can avoid them :)
#some of them might have remediated their violations, or may be operating under "new management" or maybe even turned over a new leaf - we just don't know
#build "name,address,LOS ANGELES,CA" strings for geocoding
#(the original first built a bare-address list that was immediately overwritten; that dead assignment is removed)
addresses_to_avoid = (df4['facility_name'] + ',' + df4['facility_address'] + ',' + 'LOS ANGELES' + ',CA').tolist()
#filter out by specific violation - in this case I picked Impoundment of unsanitary equipment or food - and list violators
df2.loc[(df2['violation_description'] == "# 50. Impoundment of unsanitary equipment or food") & (df2['violation_status']=="OUT OF COMPLIANCE"), ['facility_name','facility_address', 'violation_description','activity_date']]
addresses_for_violation_50= df2.loc[(df2['violation_description'] == "# 50. Impoundment of unsanitary equipment or food") & (df2['violation_status']=="OUT OF COMPLIANCE"), ['facility_name','facility_address']]
print(addresses_for_violation_50)
#this cell creates and prints breakdown list of violators for each violation code
#first, get a unique list of violations - .unique()
#then iterate the unique violations directly (no need to index via range(len(...)))
#for each unique violation, get a list of offenders
temp1 = df2.violation_description.unique()
print(len(temp1)) #print the total number of UNIQUE violations in the database
for description in temp1: #for each one, get the list of restaurants that match it in dataset
    temp2 = df2.loc[(df2['violation_description'] == description) & (df2['violation_status']=="OUT OF COMPLIANCE"), ['violation_code','facility_name','facility_address']]
    print(temp2)
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    """Draw a correlation-matrix heatmap for the numeric columns of df.

    df must carry a `dataframeName` attribute (used in the plot title).
    Columns containing NaN or only one unique value are dropped first; if
    fewer than two columns survive, a message is printed and nothing is drawn.
    graphWidth is the figure width/height in inches.
    """
    filename = df.dataframeName
    df = df.dropna(axis='columns') # drop columns with NaN (keyword form; positional axis was removed in pandas 2.x)
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        # original called nonexistent printf() with a non-f string; fixed to a real f-string
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum = 1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15) # use the dataset name instead of the '(unknown)' placeholder
    plt.show()
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    """Draw a scatter matrix (KDE on the diagonal) annotated with correlation coefficients.

    Keeps only numeric, NaN-free, non-constant columns, capped at the first 10
    (kernel density estimation needs an invertible covariance matrix).
    plotSize is the square figure size in inches; textSize the annotation font size.
    """
    df = df.select_dtypes(include =[np.number]) # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    df = df.dropna(axis='columns') # keyword form; positional axis was removed in pandas 2.x
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    columnNames = list(df)
    if len(columnNames) > 10: # reduce the number of columns for matrix inversion of kernel density plots
        columnNames = columnNames[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    # plt.np was a deprecated matplotlib alias for numpy (removed in 3.x); use np directly
    for i, j in zip(*np.triu_indices_from(ax, k = 1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()
#tag each dataframe with its source-file name (read by the plotting helpers above)
df1.dataframeName = 'restaurant-and-market-health-inspections.csv'
df2.dataframeName = 'restaurant-and-market-health-violations.csv'
plotCorrelationMatrix(df2, 8)
plotScatterMatrix(df1, 9, 10)
#column names used by the geocoding step below
address_column_name = 'facility_address'
restaurant_name = 'facility_name'
RETURN_FULL_RESULTS = False
BACKOFF_TIME = 30
#console logger at DEBUG level
logger = logging.getLogger("root")
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler() #console handler
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)
API_KEY = 'your API key here'
output_filename = '../output/output-2018.csv'
#print(addresses)
#adapted from Shane Lynn - thanks
def get_google_results(address, api_key=None, return_full_response=False):
    """Geocode one address via the Google Maps Geocoding API.

    Returns a dict with formatted_address, latitude, longitude, accuracy,
    google_place_id, type and postcode (all None when there is no match),
    plus input_string (the query echoed back), number_of_results and status.
    When return_full_response is True the raw JSON payload is attached
    under 'response'.
    """
    geocode_url = "https://maps.googleapis.com/maps/api/geocode/json?address={}".format(address)
    if api_key is not None:
        geocode_url = geocode_url + "&key={}".format(api_key)
    #ping google for the results:
    payload = requests.get(geocode_url).json()
    matches = payload['results']
    if not matches:
        #no match: keep the output schema but fill every geo field with None
        output = dict.fromkeys(
            ["formatted_address", "latitude", "longitude", "accuracy",
             "google_place_id", "type", "postcode"], None)
    else:
        best = matches[0]
        location = best.get('geometry').get('location')
        postal_parts = [component['long_name']
                        for component in best.get('address_components')
                        if 'postal_code' in component.get('types')]
        output = {
            "formatted_address": best.get('formatted_address'),
            "latitude": location.get('lat'),
            "longitude": location.get('lng'),
            "accuracy": best.get('geometry').get('location_type'),
            "google_place_id": best.get("place_id"),
            "type": ",".join(best.get('types')),
            "postcode": ",".join(postal_parts),
        }
    #append some other details
    output['input_string'] = address
    output['number_of_results'] = len(matches)
    output['status'] = payload.get('status')
    if return_full_response is True:
        output['response'] = payload
    return output
#test that API key validity, internet access confirmation, and function result are all good
test_result = get_google_results("HABITAT COFFEE SHOP, 3708 N EAGLE ROCK BLVD, LOS ANGELES, CA", API_KEY, RETURN_FULL_RESULTS)
print(test_result)
#geocode every grade-C address; a comprehension replaces the manual append loop
results2 = [get_google_results(address, API_KEY, return_full_response=RETURN_FULL_RESULTS)
            for address in addresses_to_avoid]
pd.DataFrame(results2).to_csv('../output/restaurants_to_avoid.csv', encoding='utf8')
#mapbox map addresses to avoid
import plotly.graph_objs as go
import plotly.offline as offline
import pandas as pd
#This is the mapbox API token (different from the google token used in the main restaurant code - that provided the lat, long coordinate for each specified address)
#that will be used by the website to validate user access and map the latitude, longitude
mapbox_token = "Insert your mapbox token here"
#modify the path with your path as needed
#df = pd.read_csv('../output-2018.csv')
df = pd.read_csv('../output/restaurants_to_avoid.csv')
#drop() returns a NEW frame - the original discarded the result, so the column was never removed
df = df.drop(['formatted_address'], axis=1)
#data parameters from the file to map
#marker passed as a plain dict: go.Marker was deprecated and removed in plotly 4+
#NOTE(review): newer plotly spells this textposition="top center"; "top" is the legacy value - confirm against your plotly version
trace = go.Scattermapbox(lat = df["latitude"], lon = df["longitude"], text= df["input_string"], marker=dict(size=10), mode="markers+text", textposition="top")
#34.0292265,-118.4639102
data = [trace]
#define map layout - access token (API), which location map should be centered on
layout = go.Layout(mapbox=dict(accesstoken=mapbox_token, center=dict(lat=34.03, lon=-118.46), zoom=11.0))
#finalize data and layout
figure = go.Figure(data = data, layout = layout)
#draw the map already! NOTE: map will open in your default browser - new tab
offline.plot(figure)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment