# Created August 17, 2021 15:19
# Explore in a Box (Maggie's module)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from scipy import stats
def train_validate_test_split(df, target, seed=123):
    """Split a dataframe into train, validate and test sets.

    Takes a dataframe, the name of the target variable (used for
    stratification) and an integer seed, and performs two successive
    splits:

    * test is 20% of the original dataset,
    * validate is .30 * .80 = 24% of the original dataset,
    * train is .70 * .80 = 56% of the original dataset.

    Both splits use ``seed`` as the random state and are stratified on the
    ``target`` column so the class balance is kept in every subset.

    Returns, in this order, the train, validate and test dataframes.
    """
    train_validate, test = train_test_split(
        df,
        test_size=0.2,
        random_state=seed,
        stratify=df[target],
    )
    # The second split must stratify on the reduced frame, not on `df`.
    train, validate = train_test_split(
        train_validate,
        test_size=0.3,
        random_state=seed,
        stratify=train_validate[target],
    )
    return train, validate, test
def explore_univariate(train, cat_vars, quant_vars):
    """Run the univariate exploration for every listed column of ``train``.

    Each name in ``cat_vars`` is passed to
    ``explore_univariate_categorical`` and each name in ``quant_vars`` to
    ``explore_univariate_quant``.  The pair returned by the quantitative
    helper is unpacked but not used further here.  Returns nothing.
    """
    for categorical_name in cat_vars:
        explore_univariate_categorical(train, categorical_name)

    for quantitative_name in quant_vars:
        _last_artist, _summary = explore_univariate_quant(train, quantitative_name)
def explore_bivariate(train, target, cat_vars, quant_vars):
    """Explore every listed column of ``train`` against ``target``.

    Categorical columns are handled by ``explore_bivariate_categorical``
    and quantitative columns by ``explore_bivariate_quant``, one call per
    column name, categorical columns first.  Returns nothing.
    """
    for categorical_name in cat_vars:
        explore_bivariate_categorical(train, target, categorical_name)

    for quantitative_name in quant_vars:
        explore_bivariate_quant(train, target, quantitative_name)
def explore_multivariate(train, target, cat_vars, quant_vars):
    """Draw the multivariate plots for ``train``, coloured by ``target``.

    In order: the swarm grid, the violin grid, a seaborn pair plot of the
    quantitative columns, and the combined boxen plot of all quantitative
    columns.  Nothing is returned; the plot objects are discarded.
    """
    plot_swarm_grid_with_color(train, target, cat_vars, quant_vars)
    plot_violin_grid_with_color(train, target, cat_vars, quant_vars)
    sns.pairplot(data=train, vars=quant_vars, hue=target)
    plot_all_continuous_vars(train, target, quant_vars)
### Univariate
def explore_univariate_categorical(train, cat_var):
    """Draw a bar plot of the class frequencies of one categorical column.

    The frequencies come from ``freq_table(train, cat_var)``; its
    ``'Count'`` column is plotted against the class labels.  The function
    itself returns nothing.
    """
    counts_by_class = freq_table(train, cat_var)
    sns.barplot(
        data=counts_by_class,
        x=cat_var,
        y='Count',
        color='lightseagreen',
    )
def explore_univariate_quant(train, quant_var):
    """Plot the distribution of one quantitative column and summarise it.

    Draws a histogram in the left cell and a box plot in the right cell of
    a 1x2 subplot grid, both titled with the column name.

    Returns a 2-tuple: the object returned by the last ``plt.title`` call
    (the title of the box plot) and the ``describe()`` summary of the
    column.
    """
    column = train[quant_var]
    summary = column.describe()

    # first plot: histogram
    plt.subplot(1, 2, 1)
    plt.hist(column, color='lightseagreen')
    plt.title(quant_var)

    # second plot: box plot
    plt.subplot(1, 2, 2)
    plt.boxplot(column)
    boxplot_title = plt.title(quant_var)

    return boxplot_title, summary
def freq_table(train, cat_var):
    """Build a frequency table for one categorical column.

    For the given categorical variable, compute the frequency count and
    the percent split of every class and return them as a dataframe with
    three columns: ``cat_var`` (the class label), ``'Count'`` and
    ``'Percent'`` (rounded to 2 decimals).  Rows are indexed by class
    label and ordered as ``value_counts`` orders them (most frequent
    first).
    """
    counts = train[cat_var].value_counts(normalize=False)
    # Same ratio value_counts(normalize=True) yields, but derived from
    # `counts` so both columns are guaranteed to share one index.
    percents = round(counts / counts.sum() * 100, 2)

    # Labels must come from the value_counts index: `unique()` lists
    # classes in order of appearance, which does not match the
    # frequency-sorted counts and would pair labels with the wrong rows.
    class_labels = list(counts.index)

    frequency_table = pd.DataFrame(
        {
            cat_var: class_labels,
            'Count': counts,
            'Percent': percents,
        }
    )
    return frequency_table
#### Bivariate
def explore_bivariate_categorical(train, target, cat_var):
    """Explore one categorical column against the target.

    Prints a header with the column name, builds a crosstab of the column
    against the target (with margins), runs ``run_chi2`` on the pair,
    draws ``plot_cat_by_target``, and finally prints the crosstab and the
    expected frequencies returned by the chi-square helper.

    NOTE(review): the chi-square summary returned by ``run_chi2`` is
    computed but not printed here.
    """
    print(cat_var, "\n_____________________\n")

    observed_with_totals = pd.crosstab(train[cat_var], train[target], margins=True)
    _chi2_summary, _observed, expected_counts = run_chi2(train, cat_var, target)
    plot_cat_by_target(train, target, cat_var)

    print("\nobserved:\n", observed_with_totals)
    print("\nexpected:\n", expected_counts)
def explore_bivariate_quant(train, target, quant_var):
    """Explore one quantitative column against the target.

    Prints a header with the column name, computes descriptive statistics
    per target class, compares the two target groups with
    ``compare_means``, draws a boxen plot and a swarm plot of target x
    column, then prints the statistics and the test result.
    """
    print(quant_var, "\n____________________\n")

    stats_by_class = train.groupby(target)[quant_var].describe()
    # NOTE(review): the overall mean is computed but never used in this
    # function; kept so the sequence of operations is unchanged.
    _overall_mean = train[quant_var].mean()
    test_result = compare_means(train, target, quant_var)

    plot_boxen(train, target, quant_var)
    plot_swarm(train, target, quant_var)

    print(stats_by_class, "\n")
    print("\nMann-Whitney Test:\n", test_result)
## Bivariate Categorical
def run_chi2(train, cat_var, target):
    """Run a chi-square test of independence between two columns.

    Builds the observed contingency table of ``cat_var`` against
    ``target`` and passes it to ``scipy.stats.chi2_contingency``.

    Returns a 3-tuple:

    * a one-row dataframe with the columns ``'chi2'``, ``'p-value'`` and
      ``'degrees of freedom'``,
    * the observed crosstab,
    * the expected frequencies as a dataframe carrying the same row and
      column labels as the observed crosstab.
    """
    observed = pd.crosstab(train[cat_var], train[target])
    chi2, p, degf, expected = stats.chi2_contingency(observed)
    chi2_summary = pd.DataFrame(
        {
            'chi2': [chi2],
            'p-value': [p],
            'degrees of freedom': [degf],
        }
    )
    # Reuse the observed labels so the two tables can be compared cell by
    # cell; a bare DataFrame(expected) would only have positional labels.
    expected = pd.DataFrame(
        expected,
        index=observed.index,
        columns=observed.columns,
    )
    return chi2_summary, observed, expected
def plot_cat_by_target(train, target, cat_var):
    """Bar plot of the target by category, with the overall rate marked.

    Opens a new 2x2-inch figure, draws a seaborn bar plot of ``target``
    (y) per class of ``cat_var`` (x), and adds a dashed gray horizontal
    line at the mean of the target column over all rows.

    Returns the object produced by ``plt.axhline`` (the overall-rate
    line).
    """
    plt.figure(figsize=(2, 2))
    # x and y are passed by keyword: recent seaborn releases accept only
    # `data` positionally, and older ones accept the keywords as well.
    sns.barplot(x=cat_var, y=target, data=train, alpha=.8, color='lightseagreen')
    overall_rate = train[target].mean()
    p = plt.axhline(overall_rate, ls='--', color='gray')
    return p
## Bivariate Quant
def plot_swarm(train, target, quant_var):
    """Swarm plot of a quantitative column per target class.

    The plot is titled with the column name and gets a dashed black
    horizontal line at the column's overall mean.

    Returns the object produced by ``plt.axhline`` (the mean line).
    """
    overall_mean = train[quant_var].mean()
    sns.swarmplot(x=target, y=quant_var, data=train, color='lightgray')
    plt.title(quant_var)
    return plt.axhline(overall_mean, ls='--', color='black')
def plot_boxen(train, target, quant_var):
    """Boxen plot of a quantitative column per target class.

    The plot is titled with the column name and gets a dashed black
    horizontal line at the column's overall mean.

    Returns the object produced by ``plt.axhline`` (the mean line).
    """
    overall_mean = train[quant_var].mean()
    sns.boxenplot(x=target, y=quant_var, data=train, color='lightseagreen')
    plt.title(quant_var)
    return plt.axhline(overall_mean, ls='--', color='black')
# alt_hyp = ‘two-sided’, ‘less’, ‘greater’
def compare_means(train, target, quant_var, alt_hyp='two-sided'):
    """Mann-Whitney U test of a quantitative column between two groups.

    The first sample is the column's values on rows where ``target`` equals
    0, the second is its values on rows where ``target`` equals 1.
    ``alt_hyp`` is forwarded as the ``alternative`` argument
    ('two-sided', 'less' or 'greater'); the continuity correction is
    enabled.

    Returns the result of ``scipy.stats.mannwhitneyu``.
    """
    is_class_zero = train[target] == 0
    is_class_one = train[target] == 1
    sample_zero = train.loc[is_class_zero, quant_var]
    sample_one = train.loc[is_class_one, quant_var]
    return stats.mannwhitneyu(
        sample_zero,
        sample_one,
        use_continuity=True,
        alternative=alt_hyp,
    )
### Multivariate
def plot_all_continuous_vars(train, target, quant_vars):
    """Boxen plot of every quantitative column, coloured by the target.

    Melts the quantitative columns plus the target to "long-form"
    representation (one ``measurement`` / ``value`` pair per row), then
    draws a boxen plot of measurement x value with hue representing the
    target variable.  The y axis is set to a log scale and the x label is
    cleared.  Returns nothing.
    """
    selected_columns = list(quant_vars) + [target]
    sns.set(style="whitegrid", palette="muted")
    long_form = train[selected_columns].melt(id_vars=target, var_name="measurement")
    axes = sns.boxenplot(data=long_form, x="measurement", y="value", hue=target)
    axes.set(yscale="log", xlabel='')
def plot_violin_grid_with_color(train, target, cat_vars, quant_vars):
    """Draw one row of split violin plots per quantitative column.

    For every name in ``quant_vars`` a new 16x4-inch figure is created
    with one column per entry of ``cat_vars`` and a shared y axis; each
    cell shows the quantitative column by that categorical column, split
    and coloured by ``target``.  Returns nothing.
    """
    cols = len(cat_vars)
    for quant in quant_vars:
        # squeeze=False keeps the axes two-dimensional even when there is
        # a single categorical column; otherwise plt.subplots returns a
        # bare Axes object and indexing it fails.
        _, axes = plt.subplots(
            nrows=1, ncols=cols, figsize=(16, 4), sharey=True, squeeze=False
        )
        for cat, cell in zip(cat_vars, axes[0]):
            sns.violinplot(x=cat, y=quant, data=train, split=True,
                           ax=cell, hue=target, palette="Set2")
def plot_swarm_grid_with_color(train, target, cat_vars, quant_vars):
    """Draw one row of swarm plots per quantitative column.

    For every name in ``quant_vars`` a new 16x4-inch figure is created
    with one column per entry of ``cat_vars`` and a shared y axis; each
    cell shows the quantitative column by that categorical column,
    coloured by ``target``.  Returns nothing.
    """
    cols = len(cat_vars)
    for quant in quant_vars:
        # squeeze=False keeps the axes two-dimensional even when there is
        # a single categorical column; otherwise plt.subplots returns a
        # bare Axes object and indexing it fails.
        _, axes = plt.subplots(
            nrows=1, ncols=cols, figsize=(16, 4), sharey=True, squeeze=False
        )
        for cat, cell in zip(cat_vars, axes[0]):
            sns.swarmplot(x=cat, y=quant, data=train, ax=cell,
                          hue=target, palette="Set2")
