Skip to content

Instantly share code, notes, and snippets.

@rwilleynyc
rwilleynyc / pdp_plot.py
Last active May 14, 2020 17:16
Plots a prettified PDP
# Plot a prettified partial dependence plot (PDP) for one feature of a model.
# NOTE(review): this listing shows only the start of the function and has lost
# its indentation; the plotting code and every use of `target`, `return_pd`,
# `y_pct`, `figsize`, `norm_hist` and `dec` are not visible here.
def plot_pdp(model, X, feature, target=False, return_pd=False, y_pct=True, figsize=(10,9), norm_hist=True, dec=.5):
# Get partial dependence for the single requested feature (passed as a
# one-element list).
pardep = partial_dependence(model, X, [feature])
# Get min & max values.
# NOTE(review): presumably pardep[1] holds the feature grid (x) and pardep[0]
# the partial dependence values (y) -- confirm against the import of
# `partial_dependence`, which is not shown.
xmin = pardep[1][0].min()
xmax = pardep[1][0].max()
ymin = pardep[0][0].min()
ymax = pardep[0][0].max()
name review_count rating tags
0 The Calaveras 103 4.5 ['bars', 'mexican', 'tapas', 'small', 'plates', '2']
1 Las Catrinas Mexican Bar & Eatery 301 4.0 ['mexican', 'cocktail', 'bars', '2']
2 Chano's Cantina 165 4.0 ['cocktail', 'bars', 'new', 'mexican', 'cuisine', '2']
3 Maizal Restaurant & Tequila Bar 295 4.0 ['mexican', 'cocktail', 'bars', '2']
4 Juquila Kitchen and Bar 98 4.0 ['new', 'mexican', 'cuisine', 'tacos', 'cocktail', 'bars', '2']
We can make this file beautiful and searchable if this error is corrected: Unclosed quoted field in line 10.
name,categories,rating,price
Tacuba,"[{'alias': 'mexican', 'title': 'Mexican'}, {'alias': 'tapas', 'title': 'Tapas Bars'}, {'alias': 'latin', 'title': 'Latin American'}]",3.5,$$
Mi Espiguita Taqueria,"[{'alias': 'mexican', 'title': 'Mexican'}]",4.5,$
El Mero Mero,"[{'alias': 'mexican', 'title': 'Mexican'}]",4.5,$$
Hoja Santa,"[{'alias': 'newmexican', 'title': 'New Mexican Cuisine'}, {'alias': 'mexican', 'title': 'Mexican'}]",4.5,
The Calaveras,"[{'alias': 'bars', 'title': 'Bars'}, {'alias': 'mexican', 'title': 'Mexican'}, {'alias': 'tapasmallplates', 'title': 'Tapas/Small Plates'}]",4.5,$$
Athens Grill & Sports Bar,"[{'alias': 'mexican', 'title': 'Mexican'}]",4,$
Mezquite Restaurant,"[{'alias': 'mexican', 'title': 'Mexican'}, {'alias': 'peruvian', 'title': 'Peruvian'}, {'alias': 'seafood', 'title': 'Seafood'}]",4,$$
Fresco's Cantina,"[{'alias': 'mexican', 'title': 'Mexican'}, {'alias': 'latin', 'title': 'Latin American'}, {'alias': 'newmexican', 'title': 'New Mexican Cuisine'}]",4.5,$$
Chela & Garnacha,"[{'al
@rwilleynyc
rwilleynyc / search_results.csv
Created November 23, 2019 22:45
Sample search results from Yelp API
We can make this file beautiful and searchable if this error is corrected: It looks like row 3 should actually have 17 columns, instead of 5 in line 2.
,id,alias,name,image_url,is_closed,url,review_count,categories,rating,coordinates,transactions,price,location,phone,display_phone,distance
0,EHUNrnIgnhwTnpOm3gEESg,tacuba-astoria,Tacuba,https://s3-media4.fl.yelpcdn.com/bphoto/Q6jPz4xg6QPh4NElULobrA/o.jpg,False,https://www.yelp.com/biz/tacuba-astoria?adjust_creative=TKz2edtltxgYtPzgSrH9EQ&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=TKz2edtltxgYtPzgSrH9EQ,645,"[{'alias': 'mexican', 'title': 'Mexican'}, {'alias': 'tapas', 'title': 'Tapas Bars'}, {'alias': 'latin', 'title': 'Latin American'}]",3.5,"{'latitude': 40.75585, 'longitude': -73.92447}","['delivery', 'pickup']",$$,"{'address1': '35-01 36th St', 'address2': '', 'address3': '', 'city': 'Astoria', 'zip_code': '11106', 'country': 'US', 'state': 'NY', 'display_address': ['35-01 36th St', 'Astoria, NY 11106']}",+17187862727,(718) 786-2727,1229.3517053099579
1,yvva7IYpD6J7OfKlCdQrkw,mi-espiguita-taqueria-astoria,Mi Espiguita Taqueria,https://s3-media2.fl.yelpcdn.com/bphoto/TEho39G01VJX
@rwilleynyc
rwilleynyc / guns_neighbor_gpa.py
Created September 17, 2019 16:01
Calculate neighbor GPA
# Create nested dictionaries to map grades to year and neighbor states:
# neighbor_grades[year] is itself a dict, created empty for every year in df.
# NOTE(review): the listing has lost its indentation and stops inside the
# inner loop; the lines below are presumably nested year -> state.
neighbor_grades = {}
for year in df.Year.unique():
neighbor_grades[year] = {}
for state in df.State.unique():
# Default score is state score if no neighboring state (e.g. Alaska).
# .values[0] takes the first matching row and raises IndexError when df has
# no row for this state/year pair.
default_score = df[(df.State==state) & (df.Year==year)]['State Grade'].values[0]
# NOTE(review): presumably collects the neighbors' grades; the code that
# fills and uses this list is not visible.
scores = []
@rwilleynyc
rwilleynyc / guns_neighbor_states.py
Last active September 17, 2019 15:58
Get list of neighboring states and save to dictionary
import urllib.request
import os

# Source of the US state boundary polygons (GeoJSON).
url = 'https://github.com/ritvikmath/StarbucksStoreScraping/raw/master/us_states.geojson'
# urlretrieve opens the destination path for writing and does not create
# missing directories, so make sure the target folder exists first.
os.makedirs('international_data', exist_ok=True)
# Download the file to the local data folder.
urllib.request.urlretrieve(url, 'international_data/us_states.geojson')
#manipulate complex shapes
from shapely.geometry import Polygon, MultiPolygon
#manipulate json objects
import json
@rwilleynyc
rwilleynyc / NW_region_ttest.py
Created August 8, 2019 19:06
Get t-test results for each region
import scipy.stats
# Get unique list of regions
regions = df['Region'].unique()

# For each region, compare quantities of the region against rest of world
for region in regions:
    # Quantities ordered inside the region (test) vs. everywhere else (control).
    test = df['Quantity'].loc[(df['Region'] == region)]
    control = df['Quantity'].loc[(df['Region'] != region)]
    # The file does `import scipy.stats`, which binds only the name `scipy`;
    # the bare name `stats` is never defined, so call through the package.
    # NOTE(review): `results` is overwritten on every iteration; the code that
    # consumes it is not visible in this listing.
    results = scipy.stats.ttest_ind(test, control)
@rwilleynyc
rwilleynyc / NW_anova_results.csv
Created August 8, 2019 18:35
ANOVA results output
variable_name sum_sq df F PR(>F)
C(Employee) 8259.303410743278 8.0 3.010142879436235 0.00231317377883642
C(Region) 40031.87438364985 8.0 14.589809289447707 6.6425808706248666e-21
C(CategoryName) 959.7513428085351 7.0 0.3997554241095136 0.9028853563957704
Discount 6925.520309945907 1.0 20.19231367164281 7.392950240262248e-06
Residual 704133.9307385169 2053.0
@rwilleynyc
rwilleynyc / NW_anova.py
Last active August 8, 2019 18:38
Run ANOVA test in Python
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Model specification: Quantity is the response; Discount has no C() wrapper,
# while C(...) marks Employee, Region and CategoryName as categorical terms.
formula = 'Quantity ~ Discount + C(Employee) + C(Region) + C(CategoryName)'

# Estimate the model on df by ordinary least squares and keep the fitted result.
lm = ols(
    formula,
    data=df,
).fit()
OrderId UnitPrice Quantity Discount CustomerId Region CategoryName Employee
10248 14.0 12 0.0 VINET Western Europe Dairy Products Buchanan5
10248 9.8 10 0.0 VINET Western Europe Grains/Cereals Buchanan5
10248 34.8 5 0.0 VINET Western Europe Dairy Products Buchanan5
10249 18.6 9 0.0 TOMSP Western Europe Produce Suyama6
10249 42.4 40 0.0 TOMSP Western Europe Produce Suyama6
10250 7.7 10 0.0 HANAR South America Seafood Peacock4
10250 42.4 35 0.15 HANAR South America Produce Peacock4
10250 16.8 15 0.15 HANAR South America Condiments Peacock4
10251 16.8 6 0.05 VICTE Western Europe Grains/Cereals Leverling3