#LA restaurant inspection analysis, EDA, and mapping for visualization
#TSB
#Hari Santanam, 2018
#import tools required to run the code
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import requests
import logging
import time
import seaborn as sns
#nRowsRead = 100  #uncomment this and pass nrows=nRowsRead to read_csv below if you only want to test the waters and not import the whole file
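#e.g., a sketch of the sampled read (not part of the original run):
#df1 = pd.read_csv('../input/restaurant-and-market-health-inspections.csv', nrows=nRowsRead)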
#read in file with all inspections and put in dataframe
df1 = pd.read_csv('../input/restaurant-and-market-health-inspections.csv', delimiter=',')
nRow, nCol = df1.shape
#print number of rows and columns
print(f'{nRow} rows, {nCol} columns')
#read in file with all health VIOLATIONS and put in dataframe
df2 = pd.read_csv('../input/restaurant-and-market-health-violations.csv')
from importlib import reload #reload lives in importlib, not matplotlib
reload(plt)
%matplotlib notebook
df1['score'] = df1['score'].astype(int) #make score of type INTEGER
score_dist_hist = df1['score'].plot(kind="hist", title="Score Distribution", figsize=(4,3), alpha=0.5)
score_dist_hist.set_xlabel("score - values")
df2['score'].hist() #quick histogram of the scores in the violations file
#print the top 10 rows of the violations dataframe (df2)
df2.head(10)
#group by restaurant type and count the number in each category
#then sort by count and create a bar graph
reload(plt)
%matplotlib notebook
temp = df1.groupby('pe_description').size()
description_distribution = pd.DataFrame({'Count':temp.values}, index=temp.index)
description_distribution = description_distribution.sort_values(by=['Count'], ascending=True)
description_distribution.plot(kind='barh', legend=False, figsize=(8, 10)) #bar graph of counts per restaurant type (hist() would fail on a string column)
#the previous charts and graphs show the breakdown of the various types of food restaurants,
#with risk broken down into high, moderate, and low.
#This function uses split() to pull the second-to-last space-separated token out of the
#pe_description field - ex: for x = "Aa Bb Cc", x.split(' ')[-2] returns 'Bb'
def sub_risk(x):
    return x.split(' ')[-2]
df2['risk'] = df2['pe_description'].astype(str).apply(sub_risk) #apply function to get only HIGH, MODERATE, LOW
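#quick sanity check of the split logic on a made-up value (the exact
#pe_description wording here is an assumption, not taken from the dataset):
assert sub_risk('RESTAURANT (0-30) SEATS HIGH RISK') == 'HIGH'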
temp = df2.groupby('risk').size() #group, count by risk level
#plot the counts for the 3 levels of risk
reload(plt)
%matplotlib notebook
df2['risk'].value_counts().plot(kind='bar') #bar chart of the 3 risk levels (hist() needs numeric data)
#calculate and plot pie chart for risk
risk_distribution = pd.DataFrame({'Count':temp.values}, index=temp.index)
risk_distribution
ax2 = risk_distribution['Count'].plot(kind="pie", legend=True, autopct='%.2f', figsize=(6, 6))
#show first 10 rows of the violations file dataframe
df2.head(10)
#group by violation_description, count, and sort from the most common violation to the least
violation_description = df2.groupby('violation_description').size()
pd.DataFrame({'Count':violation_description.values}, index=violation_description.index).sort_values(by='Count', ascending=False)
df2['pe_description'].value_counts()
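#note: value_counts(), as used on the previous line, is the one-step equivalent of the
#groupby/size/sort_values pattern above - it returns counts already sorted descending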
#create a simple function to map risk - low, moderate, high - to a numeric value for the heat map
def convert_risk_value(x):
    if x == 'LOW':
        return 10
    elif x == 'MODERATE':
        return 5
    else:
        return 0
#create a simple function to map grade to a numeric value
def convert_grade_value(x):
    if x == 'A':
        return 10
    elif x == 'B':
        return 5
    else:
        return 0
#call (apply) the functions created above
df2['risk_value'] = df2['risk'].apply(convert_risk_value)
df2['grade_value'] = df2['grade'].apply(convert_grade_value)
df3 = df2.loc[:, ['score', 'grade_value', 'risk_value']]
corr = df3.corr()
reload(plt)
%matplotlib notebook
sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values, cmap="Purples", center=0)
#top 20 facilities with most restaurants / markets
facility_dist = df1.groupby(['facility_id', 'facility_name']).size()
top20_facilities = facility_dist.sort_values(ascending=False).head(20)
pd.DataFrame({'Count':top20_facilities.values}, index=top20_facilities.index)
#top 30 owners with most restaurants / markets
owner_dist = df1.groupby(['owner_id', 'owner_name']).size()
top30_owners = owner_dist.sort_values(ascending=False).head(30)
pd.DataFrame({'Count':top30_owners.values}, index=top30_owners.index)
#violations listing from most common, descending - violation description, violation code, counts
violation_desc = df2.groupby(['violation_description','violation_code']).size()
pd.DataFrame({'Count':violation_desc.values}, index=violation_desc.index).sort_values(by='Count', ascending=False)
#list facilities with the most violations and the type of violation
#create a dataframe with facility and violation columns, aggregate by size, then count and sort them
violation_desc2 = df2.groupby(['facility_name','violation_description']).size()
pd.DataFrame({'Count':violation_desc2.values}, index=violation_desc2.index).sort_values(by='Count', ascending=False)
df1.head(10)
#get a list of all the restaurants with grade C
df4 = df2.loc[(df2['grade'] == 'C'), ['facility_name','facility_address','facility_zip']]
df4 = df4.drop_duplicates(['facility_name']) #only want each restaurant listed once, since many of them have multiple violations
df4
#visualize bad restaurants (grade C) on a map, so that if you are in that area, you can avoid them :)
#some of them might have remediated their violations, or may be operating under "new management", or maybe even turned over a new leaf - we just don't know
addresses_to_avoid = (df4['facility_name'] + ',' + df4['facility_address'] + ',' + 'LOS ANGELES' + ',CA').tolist()
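#each list entry now has the form 'NAME,STREET ADDRESS,LOS ANGELES,CA' - for example
#'HABITAT COFFEE SHOP,3708 N EAGLE ROCK BLVD,LOS ANGELES,CA' - which is the address
#format the geocoding calls below expect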
#filter by a specific violation - in this case I picked "Impoundment of unsanitary equipment or food" - and list the violators
df2.loc[(df2['violation_description'] == "# 50. Impoundment of unsanitary equipment or food") & (df2['violation_status']=="OUT OF COMPLIANCE"), ['facility_name','facility_address', 'violation_description','activity_date']]
addresses_for_violation_50 = df2.loc[(df2['violation_description'] == "# 50. Impoundment of unsanitary equipment or food") & (df2['violation_status']=="OUT OF COMPLIANCE"), ['facility_name','facility_address']]
print(addresses_for_violation_50)
#this cell creates and prints a breakdown list of violators for each violation code
#first, get the unique list of violations with .unique()
#then loop over the unique violation descriptions and,
#for each one, print the list of offenders
temp1 = df2.violation_description.unique()
print(len(temp1)) #print the total number of UNIQUE violations in the dataset
for desc in temp1: #for each one, get the list of restaurants that match it in the dataset
    temp2 = df2.loc[(df2['violation_description'] == desc) & (df2['violation_status']=="OUT OF COMPLIANCE"), ['violation_code','facility_name','facility_address']]
    print(temp2)
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    filename = df.dataframeName
    df = df.dropna(axis='columns') # drop columns with NaN
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns with more than 1 unique value
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum=1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    df = df.select_dtypes(include=[np.number]) # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    df = df.dropna(axis='columns')
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns with more than 1 unique value
    columnNames = list(df)
    if len(columnNames) > 10: # reduce the number of columns for matrix inversion of kernel density plots
        columnNames = columnNames[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    for i, j in zip(*np.triu_indices_from(ax, k=1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()
df1.dataframeName = 'restaurant-and-market-health-inspections.csv'
df2.dataframeName = 'restaurant-and-market-health-violations.csv'
plotCorrelationMatrix(df2, 8)
plotScatterMatrix(df1, 9, 10)
address_column_name = 'facility_address'
restaurant_name = 'facility_name'
RETURN_FULL_RESULTS = False
BACKOFF_TIME = 30
logger = logging.getLogger("root")
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler() #console handler
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)
API_KEY = 'your API key here'
output_filename = '../output/output-2018.csv'
#print(addresses)
#adapted from Shane Lynn - thanks
def get_google_results(address, api_key=None, return_full_response=False):
    geocode_url = "https://maps.googleapis.com/maps/api/geocode/json?address={}".format(address)
    if api_key is not None:
        geocode_url = geocode_url + "&key={}".format(api_key)
    #ping google for the results:
    results = requests.get(geocode_url)
    results = results.json()
    if len(results['results']) == 0:
        output = {
            "formatted_address": None,
            "latitude": None,
            "longitude": None,
            "accuracy": None,
            "google_place_id": None,
            "type": None,
            "postcode": None
        }
    else:
        answer = results['results'][0]
        output = {
            "formatted_address": answer.get('formatted_address'),
            "latitude": answer.get('geometry').get('location').get('lat'),
            "longitude": answer.get('geometry').get('location').get('lng'),
            "accuracy": answer.get('geometry').get('location_type'),
            "google_place_id": answer.get("place_id"),
            "type": ",".join(answer.get('types')),
            "postcode": ",".join([x['long_name'] for x in answer.get('address_components')
                                  if 'postal_code' in x.get('types')])
        }
    #append some other details
    output['input_string'] = address
    output['number_of_results'] = len(results['results'])
    output['status'] = results.get('status')
    if return_full_response:
        output['response'] = results
    return output
#test that the API key is valid, internet access works, and the function returns a good result
test_result = get_google_results("HABITAT COFFEE SHOP, 3708 N EAGLE ROCK BLVD, LOS ANGELES, CA", API_KEY, RETURN_FULL_RESULTS)
print(test_result)
results2 = []
for address in addresses_to_avoid:
    geocode_result = get_google_results(address, API_KEY, return_full_response=RETURN_FULL_RESULTS)
    results2.append(geocode_result)
pd.DataFrame(results2).to_csv('../output/restaurants_to_avoid.csv', encoding='utf8')
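#BACKOFF_TIME and the logger are set up above but never used in the simple loop;
#below is a minimal sketch (not part of the original analysis) of a rate-limit-aware
#variant, assuming the Geocoding API signals throttling with status 'OVER_QUERY_LIMIT'
def geocode_with_backoff(addresses, api_key=API_KEY):
    #retry each address until the API stops reporting a query-limit status
    results = []
    for address in addresses:
        while True:
            result = get_google_results(address, api_key, return_full_response=RETURN_FULL_RESULTS)
            if result['status'] == 'OVER_QUERY_LIMIT':
                logger.info('Hit query limit - backing off for %s seconds', BACKOFF_TIME)
                time.sleep(BACKOFF_TIME)
            else:
                break
        results.append(result)
    return results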
#mapbox map of the addresses to avoid
import plotly.graph_objs as go
import plotly.offline as offline
import pandas as pd
#This is the mapbox API token (different from the google token used in the main restaurant
#code, which provided the lat/long coordinates for each specified address);
#the website uses it to validate user access and map the latitude/longitude pairs
mapbox_token = "Insert your mapbox token here"
#modify the path with your path as needed
#df = pd.read_csv('../output-2018.csv')
df = pd.read_csv('../output/restaurants_to_avoid.csv')
df = df.drop(['formatted_address'], axis=1) #drop() returns a new frame, so assign it back
#data parameters from the file to map
trace = go.Scattermapbox(lat=df["latitude"], lon=df["longitude"], text=df["input_string"], marker=dict(size=10), mode="markers+text", textposition="top center")
#34.0292265, -118.4639102 - approximate coordinates the map is centered on
data = [trace]
#define map layout - access token (API), and the location the map should be centered on
layout = go.Layout(mapbox=dict(accesstoken=mapbox_token, center=dict(lat=34.03, lon=-118.46), zoom=11.0))
#finalize data and layout
figure = go.Figure(data=data, layout=layout)
#draw the map already! NOTE: the map will open in a new tab in your default browser
offline.plot(figure)
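#optional (an assumption, not in the original): offline.plot also accepts a filename
#argument if you want the map written to a specific HTML file, e.g.
#offline.plot(figure, filename='../output/restaurants_to_avoid_map.html')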