Skip to content

Instantly share code, notes, and snippets.


Will Koehrsen WillKoehrsen

View GitHub Profile
# Show 4 different binwidths, one histogram per cell of a 2x2 grid
for i, binwidth in enumerate([1, 5, 10, 15]):
    # Set up the plot (subplot positions are 1-based, hence i + 1)
    ax = plt.subplot(2, 2, i + 1)

    # Draw the plot; the number of bins is 180 divided by the bin width
    ax.hist(flights['arr_delay'], bins=int(180/binwidth),
            color='blue', edgecolor='black')
# Build one plain list of arrival delays per airline, selecting the rows
# and the 'arr_delay' column in a single .loc lookup
x1 = list(flights.loc[flights['name'] == 'United Air Lines Inc.', 'arr_delay'])
x2 = list(flights.loc[flights['name'] == 'JetBlue Airways', 'arr_delay'])
x3 = list(flights.loc[flights['name'] == 'ExpressJet Airlines Inc.', 'arr_delay'])
x4 = list(flights.loc[flights['name'] == 'Delta Air Lines Inc.', 'arr_delay'])
x5 = list(flights.loc[flights['name'] == 'American Airlines Inc.', 'arr_delay'])
# Assign colors for each airline and the names
colors = ['#E69F00', '#56B4E9', '#F0E442', '#009E73', '#D55E00']
# NOTE(review): the original line had a stray quote after the ExpressJet entry
# and was cut off before the list closed. The last two names are filled in
# from the five per-airline lists built for this plot - confirm the order.
names = [
    'United Air Lines Inc.',
    'JetBlue Airways',
    'ExpressJet Airlines Inc.',
    'Delta Air Lines Inc.',
    'American Airlines Inc.',
]
# List of five airlines to plot
# (a stray quote after the ExpressJet entry left the string unterminated)
airlines = [
    'United Air Lines Inc.',
    'JetBlue Airways',
    'ExpressJet Airlines Inc.',
    'Delta Air Lines Inc.',
    'American Airlines Inc.',
]
# Iterate through the five airlines
for airline in airlines:
    # Subset to the airline
    subset = flights[flights['name'] == airline]

    # Draw the density plot
    # NOTE(review): the plotting call is cut off in this excerpt - restore it
    # from the full snippet before running.
# Subset to Alaska Airlines: build the row mask first, then apply it
alaska_mask = flights['name'] == 'Alaska Airlines Inc.'
subset = flights[alaska_mask]

# Density Plot with Rug Plot: KDE curve only (no histogram bars),
# with a rug drawn in black
sns.distplot(
    subset['arr_delay'],
    hist=False,
    kde=True,
    rug=True,
    color='darkblue',
    kde_kws={'linewidth': 3},
    rug_kws={'color': 'black'},
)
# Plot formatting
# Import the libraries
import matplotlib.pyplot as plt
import seaborn as sns
# matplotlib histogram; the bin count is 180 / 5
n_bins = int(180/5)
plt.hist(
    flights['arr_delay'],
    bins=n_bins,
    color='blue',
    edgecolor='black',
)
# seaborn histogram
# NOTE(review): the original call was cut off after `kde=False,` and never
# closed. The remaining arguments below mirror the matplotlib histogram
# (same bin count, fill color and edge color) - confirm against the full
# snippet.
sns.distplot(flights['arr_delay'], hist=True, kde=False,
             bins=int(180/5), color='blue',
             hist_kws={'edgecolor': 'black'})
# Pandas for data management
import pandas as pd
# os methods for manipulating paths
from os.path import dirname, join
# Bokeh basics
# (the module name was missing from the curdoc import; curdoc is exposed by bokeh.io)
from bokeh.io import curdoc
from bokeh.models.widgets import Tabs
# Function to calculate correlation coefficient between two arrays
def corr(x, y, **kwargs):
    """Build a rho label for the correlation between *x* and *y*.

    Args:
        x: First array of values.
        y: Second array of values.
        **kwargs: Accepted but not used by the visible code.
    """
    # Calculate the value: the off-diagonal entry of the 2x2 matrix
    # returned by np.corrcoef
    coef = np.corrcoef(x, y)[0][1]

    # Make the label, rounded to two decimals
    label = r'$\rho$ = ' + str(round(coef, 2))

    # Add the label to the plot
    ax = plt.gca()
    # NOTE(review): the excerpt is cut off here - the call that actually
    # draws `label` on `ax` is not visible; restore it from the full snippet.
def format_data(df):
    """Prepare the student data frame for modelling.

    Works on new frames returned by ``drop`` / ``get_dummies``, so the
    caller's ``df`` is left unmodified.

    Args:
        df: DataFrame containing at least the columns 'school', 'G1',
            'G2' and 'G3'.
    """
    # Targets are final grade of student
    labels = df['G3']

    # Drop the school and the grades from features
    df = df.drop(columns=['school', 'G1', 'G2', 'G3'])

    # One-Hot Encoding of Categorical Variables
    df = pd.get_dummies(df)

    # Re-attach the target as column 'y' (by position, not by index)
    df['y'] = list(labels)
    # NOTE(review): the excerpt is cut off here - the rest of the function
    # (including whatever it returns) is not visible; restore it from the
    # full snippet.
# Context for the model
with pm.Model() as normal_model:
    # The prior for the model parameters will be a normal distribution
    family = pm.glm.families.Normal()

    # Making the model only requires specifying the formula and the data
    pm.GLM.from_formula(formula, X_train_math, family=family)

    # Perform Markov Chain Monte Carlo sampling
    # NOTE(review): the sampling call is cut off in this excerpt - restore it
    # from the full snippet.
import pymc3 as pm
# Context for the model
with pm.Model() as normal_model:
    # The prior for the data likelihood is a Normal Distribution
    family = pm.glm.families.Normal()

    # Creating the model requires a formula and data (and optionally a family)
    pm.GLM.from_formula(formula, data=X_train, family=family)