Skip to content

Instantly share code, notes, and snippets.

@joseph-allen
Created December 29, 2017 14:16
Show Gist options
  • Save joseph-allen/9e89e627915e6ea291cf06d3af928299 to your computer and use it in GitHub Desktop.
Analysis Helpers
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
def plot_histograms(df, variables, n_rows, n_cols):
    """Plot a grid of histograms, one per column named in *variables*.

    Each subplot shows a 10-bin histogram of ``df[var_name]`` with the
    column's skewness (rounded to the nearest integer) as its title.
    Tick labels are hidden to keep the grid readable.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    variables : sequence of str
        Column names to plot; must fit within ``n_rows * n_cols``.
    n_rows, n_cols : int
        Grid dimensions passed to ``fig.add_subplot``.
    """
    fig = plt.figure(figsize=(16, 12))
    for i, var_name in enumerate(variables):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        df[var_name].hist(bins=10, ax=ax)
        # Original had `round(float(...), )` — a dangling comma left behind
        # when the ndigits argument was deleted; plain round() is equivalent.
        ax.set_title('Skew: ' + str(round(float(df[var_name].skew()))))
        ax.set_xticklabels([], visible=False)
        ax.set_yticklabels([], visible=False)
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()
def plot_distribution(df, var, target, **kwargs):
    """Plot a KDE of ``df[var]`` with one curve per level of *target*.

    Good for comparing distributions across classes. Optional ``row`` /
    ``col`` keyword arguments facet the grid on additional columns.
    """
    grid = sns.FacetGrid(
        df,
        hue=target,
        aspect=4,
        row=kwargs.get('row', None),
        col=kwargs.get('col', None),
    )
    grid.map(sns.kdeplot, var, shade=True)
    grid.set(xlim=(0, df[var].max()))
    grid.add_legend()
def plot_categories(df, cat, target, **kwargs):
    """Bar-plot the mean of *target* for each level of the categorical *cat*.

    Good for plotting categorical data. Optional ``row`` / ``col`` keyword
    arguments facet the grid on additional columns.
    """
    grid = sns.FacetGrid(
        df,
        row=kwargs.get('row', None),
        col=kwargs.get('col', None),
    )
    grid.map(sns.barplot, cat, target)
    grid.add_legend()
def plot_correlation_map(df):
    """Draw an annotated heatmap of pairwise column correlations.

    Useful for seeing how similar features are to one another.
    """
    correlations = df.corr()
    _, axes = plt.subplots(figsize=(12, 10))
    palette = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(
        correlations,
        cmap=palette,
        square=True,
        cbar_kws={'shrink': .9},
        ax=axes,
        annot=True,
        annot_kws={'fontsize': 12},
    )
def describe_more(df):
    """Summarise each column of *df*: name, distinct-value count, dtype.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Columns ``Variable``, ``Levels``, ``Datatype``, sorted ascending
        by ``Levels`` (fewest distinct values first).
    """
    # Series.nunique() == len(pd.value_counts(...)): both ignore NaN.
    # The top-level pd.value_counts was deprecated and removed in pandas 2.x.
    rows = [
        {
            'Variable': name,
            'Levels': df[name].nunique(),
            'Datatype': df[name].dtypes,
        }
        for name in df
    ]
    levels = pd.DataFrame(rows, columns=['Variable', 'Levels', 'Datatype'])
    levels.sort_values(by='Levels', inplace=True)
    return levels
def plot_variable_importance(X, y):
    """Fit a decision tree on (X, y) and plot its feature importances.

    Convenience wrapper: trains a ``DecisionTreeClassifier`` with a fixed
    random state for reproducibility, then delegates the plotting to
    ``plot_model_var_imp``.
    """
    classifier = DecisionTreeClassifier(random_state=99)
    classifier.fit(X, y)
    plot_model_var_imp(classifier, X, y)
def plot_model_var_imp(model, X, y):
    """Plot the 10 most important features of a fitted tree-based model.

    Draws a horizontal bar chart (most important feature at the top) and
    prints the model's training score on (X, y).

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_`` and ``score``.
    X : pandas.DataFrame
        Feature matrix; its columns label the bars.
    y : array-like
        Targets, used only for ``model.score``.
    """
    imp = pd.DataFrame(
        model.feature_importances_,
        columns=['Importance'],
        index=X.columns,
    )
    imp = imp.sort_values(['Importance'], ascending=True)
    # BUG FIX: the original took imp[:10] after an ascending sort, which
    # selects the 10 LEAST important features. Take the tail instead so the
    # 10 most important are plotted, largest at the top of the barh chart.
    imp[-10:].plot(kind='barh')
    print(model.score(X, y))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment