Will Koehrsen WillKoehrsen

## gist:ec1bd17a1c6c2733cdabdaa36d709d49

# pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# scipy for algorithms
import scipy
from scipy import stats

# pymc3 for Bayesian inference (PyMC3 is built on Theano)

## duration_model.py
with pm.Model() as duration_model:
    # Three free parameters, each given a Normal(mu=0, tau=0.5) prior; testval
    # supplies the value each one starts from.
    alpha_skew = pm.Normal('alpha_skew', mu=0, tau=0.5, testval=3.0)
    mu_ = pm.Normal('mu', mu=0, tau=0.5, testval=7.4)
    tau_ = pm.Normal('tau', mu=0, tau=0.5, testval=1.0)

    # Likelihood: `duration` is supplied through `observed`, so this is the
    # observed skew-normal variable of the model rather than a deterministic one.
    # NOTE(review): sd is 1/tau_, and tau_ has an unbounded Normal prior, so the
    # reciprocal can go negative or blow up near zero -- confirm this is intended.
    duration_ = pm.SkewNormal('duration', alpha = alpha_skew, mu = mu_,
                              sd = 1/tau_, observed = duration)


## basic_bokeh_plot.py
# bokeh basics
from bokeh.plotting import figure
from bokeh.io import show, output_notebook

# Start from an empty 600x600 figure that already carries its title and axis labels
p = figure(
    title='Example Glyphs',
    x_axis_label='X',
    y_axis_label='Y',
    plot_width=600,
    plot_height=600,
)

# Example data

## more_controls.py
# Bin-width control: whole numbers from 1 to 30, starting at 5
binwidth_select = Slider(
    title='Delay Width (min)',
    start=1,
    end=30,
    step=1,
    value=5,
)
# Re-run `update` whenever the slider's value changes
binwidth_select.on_change('value', update)

# Two-handled slider bounding the histogram range: -60 to 180 in steps of 5,
# initially spanning -60 to 120
range_select = RangeSlider(
    title='Delay Range (min)',
    start=-60,
    end=180,
    step=5,
    value=(-60, 120),
)

## side_by_side_histogram.py
# One list of arrival delays per airline, in the order the carriers are named below
x1, x2, x3, x4, x5 = (
    list(flights[flights['name'] == carrier]['arr_delay'])
    for carrier in (
        'United Air Lines Inc.',
        'JetBlue Airways',
        'ExpressJet Airlines Inc.',
        'Delta Air Lines Inc.',
        'American Airlines Inc.',
    )
)

# Assign colors for each airline and the names
colors = ['#E69F00', '#56B4E9', '#F0E442', '#009E73', '#D55E00']
names = ['United Air Lines Inc.', 'JetBlue Airways', 'ExpressJet Airlines Inc.'',

## density_rugplot_alaska.py
# Keep only the rows whose airline name is Alaska Airlines
subset = flights[flights['name'] == 'Alaska Airlines Inc.']

# Density curve plus a rug of the individual delays; the histogram is turned off.
# NOTE(review): distplot is deprecated in recent seaborn releases -- confirm the
# pinned seaborn version still provides it.
sns.distplot(
    subset['arr_delay'],
    hist=False,
    kde=True,
    rug=True,
    color='darkblue',
    kde_kws={'linewidth': 3},
    rug_kws={'color': 'black'},
)

# Plot formatting

## data_preparation.py
def format_data(df):
    """Turn the raw student frame into one-hot features plus a target column.

    Takes `G3` as the target, drops `school`, `G1`, `G2` and `G3` from the
    features, one-hot encodes what remains with `pd.get_dummies`, and stores
    the target values in a new column `y`.

    The caller's frame is not modified: `drop` and `get_dummies` both return
    new frames that are rebound to the local name `df`.

    NOTE(review): the visible snippet ends after attaching `y` and has no
    return statement -- the remainder of the function appears to be cut off.
    """
    # Targets are final grade of student
    labels = df['G3']
    # Drop the school and the grades from features
    df = df.drop(columns=['school', 'G1', 'G2', 'G3'])

    # One-Hot Encoding of Categorical Variables
    df = pd.get_dummies(df)

    # list() discards the index of `labels`, so the values are attached by position
    df['y'] = list(labels)

## bayesian_linear_model.py
# Build a Gaussian GLM from `formula` and `X_train_math` inside a PyMC3 model context
with pm.Model() as normal_model:

    # Normal family for the GLM. NOTE(review): in pymc3's glm module the family
    # describes the likelihood of the response, not the priors on the
    # coefficients -- confirm against the pymc3 docs.
    family = pm.glm.families.Normal()

    # The GLM is built from just the formula and the data (X_train_math);
    # anything not passed here is left at from_formula's defaults.
    pm.GLM.from_formula(formula, X_train_math, family = family)

    # Perform Markov Chain Monte Carlo sampling

## features_dist.py
# X_train is our training data; work on a copy so the helper column added
# below never reaches the training frame
X_plot = X_train.copy()
# Flag each row by whether its grade reaches the median (hard-coded here as 12)
X_plot['relation_median'] = (X_plot['Grade'] >= 12)
# Relabel the boolean flag itself. The original relabelled 'Grade', which left
# the flag as raw True/False; 'Grade' has to stay numeric.
X_plot['relation_median'] = X_plot['relation_median'].map({True: 'above',
                                                           False: 'below'})
# Plot all variables in a loop
plt.figure(figsize=(12, 12))
# columns[:-1] leaves out the last column -- presumably the 'relation_median'
# helper appended above (confirm it is still the last column at this point).
# The 3x2 grid has room for six panels; enumerate is 0-based, hence i + 1.
for i, col in enumerate(X_plot.columns[:-1]):
    plt.subplot(3, 2, i + 1)

## column_conversion.py
# Swap the 'Not Available' placeholder for NaN before any numeric cast
data = data.replace({'Not Available': np.nan})

# Cast to float every column whose name contains one of the unit/score markers
for col in list(data.columns):
    if any(marker in col for marker in ('ft²', 'kBtu', 'Metric Tons CO2e',
                                        'kWh', 'therms', 'gal', 'Score')):
        data[col] = data[col].astype(float)

	# pandas and numpy for data manipulation
	import pandas as pd
	import numpy as np

	# scipy for algorithms
	import scipy
	from scipy import stats

	# pymc3 for Bayesian inference (PyMC3 is built on Theano)
	with pm.Model() as duration_model:
		# Three free parameters, each given a Normal(mu=0, tau=0.5) prior; testval
		# supplies the value each one starts from.
		alpha_skew = pm.Normal('alpha_skew', mu=0, tau=0.5, testval=3.0)
		mu_ = pm.Normal('mu', mu=0, tau=0.5, testval=7.4)
		tau_ = pm.Normal('tau', mu=0, tau=0.5, testval=1.0)

		# Likelihood: `duration` is supplied through `observed`, so this is the
		# observed skew-normal variable of the model rather than a deterministic one.
		# NOTE(review): sd is 1/tau_, and tau_ has an unbounded Normal prior, so the
		# reciprocal can go negative or blow up near zero -- confirm this is intended.
		duration_ = pm.SkewNormal(
			'duration', alpha=alpha_skew, mu=mu_,
			sd=1/tau_, observed=duration,
		)
	# bokeh basics
	from bokeh.plotting import figure
	from bokeh.io import show, output_notebook

	# Start from an empty 600x600 figure that already carries its title and axis labels
	p = figure(
		title='Example Glyphs',
		x_axis_label='X',
		y_axis_label='Y',
		plot_width=600,
		plot_height=600,
	)

	# Example data
	# Bin-width control: whole numbers from 1 to 30, starting at 5
	binwidth_select = Slider(
		title='Delay Width (min)',
		start=1,
		end=30,
		step=1,
		value=5,
	)
	# Re-run `update` whenever the slider's value changes
	binwidth_select.on_change('value', update)

	# Two-handled slider bounding the histogram range: -60 to 180 in steps of 5,
	# initially spanning -60 to 120
	range_select = RangeSlider(
		title='Delay Range (min)',
		start=-60,
		end=180,
		step=5,
		value=(-60, 120),
	)
	# One list of arrival delays per airline, in the order the carriers are named below
	x1, x2, x3, x4, x5 = (
		list(flights[flights['name'] == carrier]['arr_delay'])
		for carrier in (
			'United Air Lines Inc.',
			'JetBlue Airways',
			'ExpressJet Airlines Inc.',
			'Delta Air Lines Inc.',
			'American Airlines Inc.',
		)
	)

	# Assign colors for each airline and the names
	colors = ['#E69F00', '#56B4E9', '#F0E442', '#009E73', '#D55E00']
	names = ['United Air Lines Inc.', 'JetBlue Airways', 'ExpressJet Airlines Inc.'',
	# Keep only the rows whose airline name is Alaska Airlines
	subset = flights[flights['name'] == 'Alaska Airlines Inc.']

	# Density curve plus a rug of the individual delays; the histogram is turned off.
	# NOTE(review): distplot is deprecated in recent seaborn releases -- confirm the
	# pinned seaborn version still provides it.
	sns.distplot(
		subset['arr_delay'],
		hist=False,
		kde=True,
		rug=True,
		color='darkblue',
		kde_kws={'linewidth': 3},
		rug_kws={'color': 'black'},
	)

	# Plot formatting
	def format_data(df):
		"""Turn the raw student frame into one-hot features plus a target column.

		Takes `G3` as the target, drops `school`, `G1`, `G2` and `G3` from the
		features, one-hot encodes what remains with `pd.get_dummies`, and stores
		the target values in a new column `y`.

		The caller's frame is not modified: `drop` and `get_dummies` both return
		new frames that are rebound to the local name `df`.

		NOTE(review): the visible snippet ends after attaching `y` and has no
		return statement -- the remainder of the function appears to be cut off.
		"""
		# Targets are final grade of student
		labels = df['G3']
		# Drop the school and the grades from features
		df = df.drop(columns=['school', 'G1', 'G2', 'G3'])

		# One-Hot Encoding of Categorical Variables
		df = pd.get_dummies(df)

		# list() discards the index of `labels`, so the values are attached by position
		df['y'] = list(labels)
	# Build a Gaussian GLM from `formula` and `X_train_math` inside a PyMC3 model context
	with pm.Model() as normal_model:

		# Normal family for the GLM. NOTE(review): in pymc3's glm module the family
		# describes the likelihood of the response, not the priors on the
		# coefficients -- confirm against the pymc3 docs.
		family = pm.glm.families.Normal()

		# The GLM is built from just the formula and the data (X_train_math);
		# anything not passed here is left at from_formula's defaults.
		pm.GLM.from_formula(formula, X_train_math, family=family)

	# Perform Markov Chain Monte Carlo sampling
	# X_train is our training data; work on a copy so the helper column added
	# below never reaches the training frame
	X_plot = X_train.copy()
	# Flag each row by whether its grade reaches the median (hard-coded here as 12)
	X_plot['relation_median'] = (X_plot['Grade'] >= 12)
	# Relabel the boolean flag itself. The original relabelled 'Grade', which left
	# the flag as raw True/False; 'Grade' has to stay numeric.
	X_plot['relation_median'] = X_plot['relation_median'].map(
		{True: 'above', False: 'below'})
	# Plot all variables in a loop
	plt.figure(figsize=(12, 12))
	# columns[:-1] leaves out the last column -- presumably the 'relation_median'
	# helper appended above (confirm it is still the last column at this point).
	# The 3x2 grid has room for six panels; enumerate is 0-based, hence i + 1.
	for i, col in enumerate(X_plot.columns[:-1]):
		plt.subplot(3, 2, i + 1)
	# Swap the 'Not Available' placeholder for NaN before any numeric cast
	data = data.replace({'Not Available': np.nan})

	# Cast to float every column whose name contains one of the unit/score markers
	for col in list(data.columns):
		if any(marker in col for marker in ('ft²', 'kBtu', 'Metric Tons CO2e',
				'kWh', 'therms', 'gal', 'Score')):
			data[col] = data[col].astype(float)