Skip to content

Instantly share code, notes, and snippets.

@sebp
Created March 20, 2019 14:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sebp/9c718d4f031a12378051a8581ba7960e to your computer and use it in GitHub Desktop.
Save sebp/9c718d4f031a12378051a8581ba7960e to your computer and use it in GitHub Desktop.
Plot missing values.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
def index_to_binary_matrix(index):
    """Expand colon-separated pattern labels into a matrix of integers.

    Every entry of ``index`` is expected to be a string such as ``"1:0:1"``.
    Each string is split on ``":"`` and its parts are converted to ``int``.

    :param index: Pattern labels
    :type index: pandas.Index
    :return: One row per label (the label is kept as row index) and one
        column per position within the pattern
    :rtype: pandas.DataFrame
    """
    def _pattern_to_row(pattern):
        flags = [int(token) for token in pattern.split(":")]
        return pd.Series(flags, name=pattern)

    labels = index.to_series()
    return labels.apply(_pattern_to_row)
class MissingValuesPlot:
    """Summarise and plot the missing values of a data frame.

    Counts of missing values per column and per-row patterns of
    missingness are recomputed from ``data`` on every call.
    """

    def __init__(self, data, label=None, sort_by_missing=True):
        """
        :param data: Data to analyse
        :type data: pandas.DataFrame
        :param label: Label of the data, used as x-axis label of the plots
        :type label: str
        :param sort_by_missing: Whether to sort columns by
            amount of missingness
        :type sort_by_missing: bool
        """
        self.data = data
        self.label = label
        self.sort_by_missing = sort_by_missing

    def get_missings(self):
        """Count the missing values of every column.

        :return: Number of missing values, indexed by column name
        :rtype: pandas.Series
        """
        # The ``reduce`` keyword of ``DataFrame.apply`` no longer exists in
        # current pandas; a column-wise sum of the null mask yields the same
        # counts without going through ``apply``.
        return self.data.isnull().sum(axis=0)

    def get_combinations(self):
        """Tabulate the distinct per-row patterns of missing values.

        A pattern is a string with one flag per column, joined by ``":"``,
        where ``1`` marks a missing value and ``0`` an observed one.

        :return: Data frame indexed by pattern with the columns ``Count``
            (number of rows showing the pattern) and ``Percent`` (the same
            relative to the number of rows), as ordered by ``value_counts``
        :rtype: pandas.DataFrame
        """
        null_flags = self.data.isnull().astype(np.int8)
        # Build the pattern strings row by row instead of relying on
        # ``DataFrame.apply(..., reduce=True)``, which current pandas rejects.
        miss_patterns = pd.Series(
            [":".join(str(flag) for flag in row)
             for row in null_flags.values.tolist()],
            index=self.data.index,
            dtype=object,
        )

        counts = miss_patterns.value_counts()
        percentage = counts * 100 / self.data.shape[0]
        df = pd.DataFrame({"Count": counts, "Percent": percentage})

        # Internal sanity check: every row belongs to exactly one pattern.
        assert df["Count"].sum() == self.data.shape[0]
        return df

    def plot_combinations(self, max_patterns=None, **fig_kw):
        """Plot combinations of missing values.

        The left panel is a heat map with one row per pattern and one column
        per feature, the right panel a horizontal bar chart with the
        percentage of rows showing each pattern.

        :param max_patterns: If not None, only the first ``max_patterns``
            rows returned by :meth:`get_combinations` are plotted
        :type max_patterns: int
        :param fig_kw: Additional keyword arguments passed to
            ``matplotlib.pyplot.subplots``
        :return: The figure holding both panels
        :rtype: matplotlib.figure.Figure
        """
        miss = self.get_missings()
        combs = self.get_combinations()
        if max_patterns is not None:
            combs = combs.head(max_patterns)
        binary_mat = index_to_binary_matrix(combs.index)

        fig, (ax1, ax2) = plt.subplots(
            1, 2, sharey=True,
            gridspec_kw={"width_ratios": [4, 1], "wspace": 0.01,
                         "bottom": 0.25, "top": 0.96},
            **fig_kw)

        # Rows are reversed so that the first (most frequent) pattern ends up
        # in the last row of the heat map.
        if self.sort_by_missing:
            o = np.argsort(-miss.values)
            data = binary_mat.iloc[::-1, o]
            xticklabels = miss.iloc[o].index.tolist()
        else:
            data = binary_mat.iloc[::-1, :]
            xticklabels = miss.index.tolist()

        sns.heatmap(data, xticklabels=xticklabels,
                    cbar=False, yticklabels=True, linewidths=.1,
                    ax=ax1)
        ax1.set_title("Combinations")
        for t in ax1.get_xticklabels():
            t.set_rotation('vertical')
        if self.label is not None:
            ax1.set_xlabel(self.label)

        pos = ax1.get_yticks()
        percent = combs["Percent"]
        # Keep every bar next to its own pattern. seaborn >= 0.8 draws row i
        # of ``data`` at tick i on an inverted y axis, so the percentages have
        # to follow the reversed row order of ``data``. Older releases flipped
        # the rows internally and left the axis upright, in which case the
        # original order already lines up.
        if ax1.yaxis_inverted():
            percent = percent.iloc[::-1]

        ax2.barh(pos, width=percent, height=.9, align="center")
        ax2.yaxis.grid(False)
        ax1.yaxis.set_visible(False)
        ax2.yaxis.set_visible(False)
        ax2.set_xlabel("Percent")

        # A twin axis on the right carries the percentage as tick label of
        # each bar; it does not share limits automatically, so copy them.
        ax3 = ax2.twinx()
        ax3.set_yticks(pos)
        ax3.set_ylim(ax2.get_ylim())
        ax3.set_yticklabels(["%.1f%%" % v for v in percent])
        ax3.yaxis.grid(False)
        return fig

    def plot_percentage(self, **kwargs):
        """Plot the proportion of missing values of every column as bars.

        :param kwargs: Additional keyword arguments passed to
            ``pandas.Series.plot.bar``
        :return: The axes the bars were drawn on
        :rtype: matplotlib.axes.Axes
        """
        miss = self.get_missings()
        # Despite the method name this is a proportion in [0, 1], which is
        # what the y-axis label states.
        df_percentage = miss / self.data.shape[0]
        if self.sort_by_missing:
            data = df_percentage.sort_values(ascending=False)
        else:
            data = df_percentage

        ax = data.plot.bar(**kwargs)
        plt.subplots_adjust(bottom=0.25, top=0.96)
        ax.set_ylabel("Proportion of missings")
        if self.label is not None:
            ax.set_xlabel(self.label)
        return ax
def drop_completely_missing(data):
    """Return a copy of ``data`` without the rows that are missing entirely.

    :param data: Data to filter
    :type data: pandas.DataFrame
    :return: Independent copy holding the rows whose fraction of missing
        values differs from 1
    :rtype: pandas.DataFrame
    """
    n_columns = data.shape[1]
    missing_fraction = data.isnull().sum(axis=1).div(n_columns)
    has_observation = missing_fraction.ne(1)
    return data.loc[has_observation].copy()
def get_top_missing_pattern(data):
    """Summarise the most frequent pattern of missing values.

    :param data: Data to analyse
    :type data: pandas.DataFrame
    :return: Single-row data frame indexed by the most frequent pattern.
        Besides ``Count`` and ``Percent`` of that pattern it holds
        ``Missing`` (number of missing columns in the pattern),
        ``Complete cases`` and ``Complete cases (percentage)`` (rows without
        any missing value), ``Feature`` (column with the most missing
        values) and ``Feature Missing`` (percentage of rows in which that
        column is missing)
    :rtype: pandas.DataFrame
    """
    mvp = MissingValuesPlot(data)
    all_combs = mvp.get_combinations()
    top_comb = all_combs.head(1).copy()
    binary_mat = index_to_binary_matrix(top_comb.index)
    top_comb["Missing"] = binary_mat.sum(axis=1)

    # The pattern of a fully observed row consists of zeros only.
    complete_index = ":".join(["0"] * binary_mat.shape[1])
    if complete_index in all_combs.index:
        complete = all_combs.loc[complete_index, :]
    else:
        # No complete row at all. The two labels must be separate entries,
        # otherwise constructing the series fails and the lookups below
        # could not resolve "Count" / "Percent".
        complete = pd.Series([0, 0], index=["Count", "Percent"])
    top_comb["Complete cases"] = complete["Count"]
    top_comb["Complete cases (percentage)"] = complete["Percent"]

    top_feature = mvp.get_missings().sort_values(ascending=False).head(1)
    # ``Index.format`` is deprecated in recent pandas; ``str`` gives the same
    # label text for ordinary column names.
    top_comb["Feature"] = str(top_feature.index[0])
    top_comb["Feature Missing"] = top_feature.iloc[0] * 100 / data.shape[0]
    return top_comb
def plot_missing_values_as_pdf(data, name, sort_by_missing=True, max_patterns=None,
                               xlabel=None, figsize=None):
    """Write the missing-value plots of ``data`` to a two-page PDF.

    The file is called ``<name>.pdf``. Page one shows the combinations of
    missing values, page two the proportion of missing values per column.

    :type data: pandas.DataFrame
    :param name: Path of the output file without the ``.pdf`` extension
    :type name: str
    :param sort_by_missing: Whether to sort columns by amount of missingness
    :param max_patterns: Maximum number of combinations to plot
    :param xlabel: Label of the x-axis of both plots
    :param figsize: Size of the figure
    :type figsize: tuple
    """
    def _emit_page(pdf):
        # Store whichever figure is current as the next page, then drop it.
        pdf.savefig(bbox_inches="tight")
        plt.close()

    with PdfPages("%s.pdf" % name) as pdf:
        plotter = MissingValuesPlot(data, label=xlabel,
                                    sort_by_missing=sort_by_missing)

        # Page 1: patterns of missing values.
        plotter.plot_combinations(max_patterns=max_patterns, figsize=figsize)
        _emit_page(pdf)

        # Page 2: proportion of missing values per column, on a new figure.
        plt.figure(figsize=figsize)
        plotter.plot_percentage()
        _emit_page(pdf)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment