This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pandas and numpy for data manipulation | |
import pandas as pd | |
import numpy as np | |
# scipy for algorithms | |
import scipy | |
from scipy import stats | |
# pymc3 for Bayesian Inference, pymc built on t |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
with pm.Model() as duration_model:
    # Priors for the three sampled parameters.
    alpha_skew = pm.Normal('alpha_skew', mu=0, tau=0.5, testval=3.0)
    mu_ = pm.Normal('mu', mu=0, tau=0.5, testval=7.4)
    # 'tau' is used below as an inverse standard deviation (sd = 1 / tau),
    # so it must be strictly positive. A HalfNormal is the zero-mean Normal
    # prior restricted to positive values, which keeps sd valid everywhere
    # the sampler can go.
    tau_ = pm.HalfNormal('tau', tau=0.5, testval=1.0)

    # Duration is the observed variable, given a skew-normal likelihood
    duration_ = pm.SkewNormal('duration', alpha=alpha_skew, mu=mu_,
                              sd=1 / tau_, observed=duration)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# bokeh basics | |
from bokeh.plotting import figure | |
from bokeh.io import show, output_notebook | |
# Empty 600 x 600 figure carrying a title and a label on each axis
p = figure(
    title='Example Glyphs',
    x_axis_label='X',
    y_axis_label='Y',
    plot_width=600,
    plot_height=600,
)
# Example data |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Slider for choosing the bin width; its value is the selected number
binwidth_select = Slider(
    title='Delay Width (min)',
    start=1,
    end=30,
    step=1,
    value=5,
)
# Call update whenever the slider's value changes
binwidth_select.on_change('value', update)

# RangeSlider for the minimum and maximum values shown on the histogram
range_select = RangeSlider(
    title='Delay Range (min)',
    start=-60,
    end=180,
    step=5,
    value=(-60, 120),
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# One list of arrival delays per airline, selected by the 'name' column
x1 = list(flights.loc[flights['name'] == 'United Air Lines Inc.', 'arr_delay'])
x2 = list(flights.loc[flights['name'] == 'JetBlue Airways', 'arr_delay'])
x3 = list(flights.loc[flights['name'] == 'ExpressJet Airlines Inc.', 'arr_delay'])
x4 = list(flights.loc[flights['name'] == 'Delta Air Lines Inc.', 'arr_delay'])
x5 = list(flights.loc[flights['name'] == 'American Airlines Inc.', 'arr_delay'])

# One color per airline, in the same order as x1..x5 (names follow)
colors = [
    '#E69F00',
    '#56B4E9',
    '#F0E442',
    '#009E73',
    '#D55E00',
]
names = ['United Air Lines Inc.', 'JetBlue Airways', 'ExpressJet Airlines Inc.'', |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Keep only the rows whose airline name is Alaska Airlines
subset = flights.loc[flights['name'] == 'Alaska Airlines Inc.']

# Density curve plus rug marks for arrival delay, with the histogram off
sns.distplot(
    subset['arr_delay'],
    hist=False,
    kde=True,
    rug=True,
    color='darkblue',
    kde_kws={'linewidth': 3},
    rug_kws={'color': 'black'},
)
# Plot formatting |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def format_data(df): | |
# Targets are final grade of student | |
labels = df['G3'] | |
# Drop the school and the grades from features | |
df = df.drop(columns=['school', 'G1', 'G2', 'G3']) | |
# One-Hot Encoding of Categorical Variables | |
df = pd.get_dummies(df) | |
df['y'] = list(labels) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Context for the model | |
with pm.Model() as normal_model: | |
# The family sets the likelihood of the response to a normal distribution | |
family = pm.glm.families.Normal() | |
# Making the model only requires specifying the formula and the data | |
pm.GLM.from_formula(formula, X_train_math, family = family) | |
# Perform Markov Chain Monte Carlo sampling |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# X_train is our training data; plot from a copy so the extra column
# added below never reaches the training frame
X_plot = X_train.copy()

# Label each grade as above/below the threshold of 12 (the median, per the
# original comment). The labels are stored in 'relation_median'; the numeric
# 'Grade' column is left untouched so it can still be plotted.
X_plot['relation_median'] = (X_plot['Grade'] >= 12).map(
    {True: 'above', False: 'below'})
# Plot all variables in a loop | |
plt.figure(figsize=(12, 12)) | |
for i, col in enumerate(X_plot.columns[:-1]): | |
plt.subplot(3, 2, i + 1) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Turn the 'Not Available' placeholder string into a real missing value
data = data.replace({'Not Available': np.nan})

# Substrings that mark a column as one that should be numeric
numeric_markers = ('ft²', 'kBtu', 'Metric Tons CO2e', 'kWh',
                   'therms', 'gal', 'Score')

# Cast every column whose name contains one of the markers to float
for col in list(data.columns):
    if any(marker in col for marker in numeric_markers):
        data[col] = data[col].astype(float)
OlderNewer