Skip to content

Instantly share code, notes, and snippets.

Avatar
🌆
Improving

Will Koehrsen WillKoehrsen

🌆
Improving
View GitHub Profile
View gist:ec1bd17a1c6c2733cdabdaa36d709d49
# pandas and numpy for data manipulation
import pandas as pd
import numpy as np
# scipy for algorithms
import scipy
from scipy import stats
# pymc3 for Bayesian Inference, pymc built on theano
View duration_model.py
with pm.Model() as duration_model:
    # Three free parameters to sample, each given a Normal(mu=0, tau=0.5) prior.
    # testval supplies the value each parameter starts from.
    alpha_skew = pm.Normal('alpha_skew', mu=0, tau=0.5, testval=3.0)
    mu_ = pm.Normal('mu', mu=0, tau=0.5, testval=7.4)
    tau_ = pm.Normal('tau', mu=0, tau=0.5, testval=1.0)

    # Duration is the observed variable: a SkewNormal likelihood evaluated
    # against the data passed in through `observed`.
    # NOTE(review): tau_ has a Normal prior centred on 0, so 1/tau_ can be
    # negative or blow up near zero -- confirm this parameterisation of the
    # standard deviation is what is intended.
    duration_ = pm.SkewNormal('duration', alpha = alpha_skew, mu = mu_,
                              sd = 1/tau_, observed = duration)
View basic_bokeh_plot.py
# bokeh basics
from bokeh.plotting import figure
from bokeh.io import show, output_notebook
# Empty 600 x 600 figure carrying only a title and the two axis labels
p = figure(
    title = 'Example Glyphs',
    x_axis_label = 'X',
    y_axis_label = 'Y',
    plot_width = 600,
    plot_height = 600,
)
# Example data
View more_controls.py
# Slider for the histogram bin width; the slider's value is the chosen width
binwidth_select = Slider(title = 'Delay Width (min)',
                         start = 1, end = 30, step = 1, value = 5)
# Call update whenever the slider's value changes
binwidth_select.on_change('value', update)

# RangeSlider for the lower and upper limits displayed on the histogram
range_select = RangeSlider(title = 'Delay Range (min)',
                           start = -60, end = 180, step = 5,
                           value = (-60, 120))
View different_binwidths.py
# Show 4 different binwidths, one per panel of a 2 x 2 grid
for i, binwidth in enumerate([1, 5, 10, 15]):
    # Set up the plot (subplot positions are 1-based, hence i + 1)
    ax = plt.subplot(2, 2, i + 1)
    # Draw the plot. The bin count is 180 / binwidth, truncated to an int.
    # NOTE(review): presumably the delays span a 180-minute range, so each
    # bin comes out `binwidth` minutes wide -- confirm against the data.
    ax.hist(flights['arr_delay'], bins = int(180/binwidth),
            color = 'blue', edgecolor = 'black')
View side_by_side_histogram.py
# Airline names, in the same order as the delay lists x1..x5 built below
names = ['United Air Lines Inc.', 'JetBlue Airways', 'ExpressJet Airlines Inc.',
         'Delta Air Lines Inc.', 'American Airlines Inc.']

# Make a separate list of arrival delays for each airline. Building them
# from `names` keeps every list aligned with its label.
x1, x2, x3, x4, x5 = [list(flights[flights['name'] == name]['arr_delay'])
                      for name in names]

# Assign a color to each airline (one entry per name above)
colors = ['#E69F00', '#56B4E9', '#F0E442', '#009E73', '#D55E00']
View density_rugplot_alaska.py
# Keep only the rows belonging to Alaska Airlines
subset = flights[flights['name'] == 'Alaska Airlines Inc.']

# Draw the density curve together with a rug plot; the histogram is turned off
sns.distplot(
    subset['arr_delay'],
    hist = False,
    kde = True,
    rug = True,
    color = 'darkblue',
    kde_kws = {'linewidth': 3},
    rug_kws = {'color': 'black'},
)
# Plot formatting
View data_preparation.py
def format_data(df):
# Targets are final grade of student
labels = df['G3']
# Drop the school and the grades from features
df = df.drop(columns=['school', 'G1', 'G2', 'G3'])
# One-Hot Encoding of Categorical Variables
df = pd.get_dummies(df)
df['y'] = list(labels)
View bayesian_linear_model.py
# Context for the model
with pm.Model() as normal_model:
    # The family sets the likelihood of the observed data: a normal distribution
    family = pm.glm.families.Normal()
    # Making the model only requires specifying the formula and the data;
    # the model is registered on the enclosing normal_model context
    pm.GLM.from_formula(formula, X_train_math, family = family)
# Perform Markov Chain Monte Carlo sampling
View query_vars.py
# Examines the effect of changing a single variable
# Takes in the name of the variable, the trace, and the data
def model_effect(query_var, trace, X):
# Variables that do not change
steady_vars = list(X.columns)
steady_vars.remove(query_var)
# Linear Model that estimates a grade based on the value of the query variable
# and one sample from the trace