Skip to content

Instantly share code, notes, and snippets.

View WillKoehrsen's full-sized avatar
🌆
All watched over by machines of loving grace

Will Koehrsen WillKoehrsen

🌆
All watched over by machines of loving grace
View GitHub Profile
# pandas and numpy for data manipulation
import pandas as pd
import numpy as np
# scipy for algorithms
import scipy
from scipy import stats
# pymc3 for Bayesian Inference, pymc built on t
with pm.Model() as duration_model:
# Three parameters to sample
alpha_skew = pm.Normal('alpha_skew', mu=0, tau=0.5, testval=3.0)
mu_ = pm.Normal('mu', mu=0, tau=0.5, testval=7.4)
tau_ = pm.Normal('tau', mu=0, tau=0.5, testval=1.0)
# Duration is a deterministic variable
duration_ = pm.SkewNormal('duration', alpha = alpha_skew, mu = mu_,
sd = 1/tau_, observed = duration)
# bokeh basics
from bokeh.plotting import figure
from bokeh.io import show, output_notebook
# Create a blank figure with labels
p = figure(plot_width = 600, plot_height = 600,
title = 'Example Glyphs',
x_axis_label = 'X', y_axis_label = 'Y')
# Example data
# Slider to select the binwidth, value is selected number
binwidth_select = Slider(start = 1, end = 30,
step = 1, value = 5,
title = 'Delay Width (min)')
# Update the plot when the value is changed
binwidth_select.on_change('value', update)
# RangeSlider to change the maximum and minimum values on histogram
range_select = RangeSlider(start = -60, end = 180, value = (-60, 120),
step = 5, title = 'Delay Range (min)')
# Make a separate list for each airline
x1 = list(flights[flights['name'] == 'United Air Lines Inc.']['arr_delay'])
x2 = list(flights[flights['name'] == 'JetBlue Airways']['arr_delay'])
x3 = list(flights[flights['name'] == 'ExpressJet Airlines Inc.']['arr_delay'])
x4 = list(flights[flights['name'] == 'Delta Air Lines Inc.']['arr_delay'])
x5 = list(flights[flights['name'] == 'American Airlines Inc.']['arr_delay'])
# Assign colors for each airline and the names
colors = ['#E69F00', '#56B4E9', '#F0E442', '#009E73', '#D55E00']
names = ['United Air Lines Inc.', 'JetBlue Airways', 'ExpressJet Airlines Inc.'',
# Subset to Alaska Airlines
subset = flights[flights['name'] == 'Alaska Airlines Inc.']
# Density Plot with Rug Plot
sns.distplot(subset['arr_delay'], hist = False, kde = True, rug = True,
color = 'darkblue',
kde_kws={'linewidth': 3},
rug_kws={'color': 'black'})
# Plot formatting
def format_data(df):
# Targets are final grade of student
labels = df['G3']
# Drop the school and the grades from features
df = df.drop(columns=['school', 'G1', 'G2', 'G3'])
# One-Hot Encoding of Categorical Variables
df = pd.get_dummies(df)
df['y'] = list(labels)
# Context for the model
with pm.Model() as normal_model:
# The prior for the model parameters will be a normal distribution
family = pm.glm.families.Normal()
# Making the model only requires specifying the formula and the data
pm.GLM.from_formula(formula, X_train_math, family = family)
# Perform Markov Chain Monte Carlo sampling
# X_train is our training data, we will make a copy for plotting
X_plot = X_train.copy()
# Compare grades to the median
X_plot['relation_median'] = (X_plot['Grade'] >= 12)
X_plot['Grade'] = X_plot['Grade'].replace({True: 'above',
False: 'below'})
# Plot all variables in a loop
plt.figure(figsize=(12, 12))
for i, col in enumerate(X_plot.columns[:-1]):
plt.subplot(3, 2, i + 1)
# Replace the string for missing values with not a number
data = data.replace({'Not Available': np.nan})
# Iterate through the columns
for col in list(data.columns):
# Select columns that should be numeric using string matching
if ('ft²' in col or 'kBtu' in col or 'Metric Tons CO2e' in col or 'kWh' in
col or 'therms' in col or 'gal' in col or 'Score' in col):
# Convert the data type to float
data[col] = data[col].astype(float)