Skip to content

Instantly share code, notes, and snippets.

@dkapitan
Last active November 1, 2021 08:54
Show Gist options
  • Save dkapitan/81dbd97f3638aed5ff843261dba6d84e to your computer and use it in GitHub Desktop.
Save dkapitan/81dbd97f3638aed5ff843261dba6d84e to your computer and use it in GitHub Desktop.
Monkey-patch for pd.DataFrame.describe() with robust statistics
def describe_robust(self, percentiles=None, include=None, exclude=None, trim=0.2):
    """
    Monkey-patch for pd.DataFrame.describe based on robust statistics.

    Calculate trimmed mean and winsorized standard deviation with default
    trim 0.2. Uses scipy.stats.mstats (trimmed_mean, winsorize) and numpy.std.
    See e.g. http://www.uh.edu/~ttian/ES.pdf for methodical background.

    BSD 3-Clause License
    Copyright (c) 2018, Daniel Kapitan (daniel@kapitan.net)
    All rights reserved.

    Parameters
    ----------
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should
        fall between 0 and 1. The default is
        ``[.25, .5, .75]``, which returns the 25th, 50th, and
        75th percentiles. The median (0.5) is always added.
    include : 'all', list-like of dtypes or None (default), optional
        A white list of data types to include in the result. Ignored
        for ``Series``. Here are the options:

        - 'all' : All columns of the input will be included in the output.
        - A list-like of dtypes : Limits the results to the
          provided data types (forwarded to ``select_dtypes``).
          To limit the result to numeric types submit
          ``numpy.number``. To limit it instead to object columns submit
          the ``object`` data type. Strings
          can also be used in the style of
          ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
          select pandas categorical columns, use ``'category'``
        - None (default) : The result will include all numeric columns,
          or all columns when there are no numeric ones.
    exclude : list-like of dtypes or None (default), optional
        A black list of data types to omit from the result. Ignored
        for ``Series``. Here are the options:

        - A list-like of dtypes : Excludes the provided data types
          from the result. To exclude numeric types submit
          ``numpy.number``. To exclude object columns submit the data
          type ``object``. Strings can also be used in the style of
          ``select_dtypes`` (e.g. ``df.describe(exclude=['O'])``). To
          exclude pandas categorical columns, use ``'category'``
        - None (default) : The result will exclude nothing.
    trim : float or (float, float), optional
        Fraction of observations to trim / winsorize on both sides
        (or a ``(lower, upper)`` pair). 0.2 by default.

    Returns
    -------
    summary : Series/DataFrame of robust summary statistics

    Raises
    ------
    NotImplementedError
        If ``self`` has three or more dimensions.
    ValueError
        If a DataFrame has no columns, a percentile lies outside [0, 1],
        percentiles contain duplicates, or ``exclude`` is given together
        with ``include='all'``.
    """
    # Imports are kept function-local so the function can be pasted and
    # attached to pandas objects without any module-level setup.
    from pandas.api.types import (
        is_bool_dtype,
        is_datetime64_dtype,
        is_numeric_dtype,
        is_timedelta64_dtype,
    )
    from pandas.io.formats.format import format_percentiles
    from scipy.stats.mstats import trimmed_mean, winsorize
    import numpy as np
    import pandas as pd

    if self.ndim >= 3:
        msg = "describe is not implemented on Panel objects."
        raise NotImplementedError(msg)
    elif self.ndim == 2 and self.columns.size == 0:
        raise ValueError("Cannot describe a DataFrame without columns")

    if percentiles is not None:
        # explicit conversion of `percentiles` to list
        percentiles = list(percentiles)
        # Validate here rather than through the private
        # ``_check_percentile`` method, which current pandas objects no
        # longer provide. NaN fails both comparisons and is rejected too.
        as_float = np.asarray(percentiles, dtype=float)
        if not np.all((as_float >= 0) & (as_float <= 1)):
            raise ValueError(
                "percentiles should all be in the interval [0, 1]. "
                "Try {0} instead.".format(as_float / 100.0)
            )
        # median should always be included
        if 0.5 not in percentiles:
            percentiles.append(0.5)
        percentiles = np.asarray(percentiles)
    else:
        percentiles = np.array([0.25, 0.5, 0.75])

    # sort and check for duplicates
    unique_pcts = np.unique(percentiles)
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")
    percentiles = unique_pcts
    formatted_percentiles = format_percentiles(percentiles)

    # scipy only expands a scalar limit to a pair when it is a ``float``;
    # building the pair here also makes e.g. ``trim=0`` work.
    limits = trim if isinstance(trim, tuple) else (trim, trim)

    def describe_numeric_1d_robust(series):
        """Summarise one numeric (or timedelta) column with robust stats."""
        stat_index = (['count', 'trim_mean', 'trim_std', 'min'] +
                      formatted_percentiles + ['max'])
        valid = series.dropna()
        timedelta_like = is_timedelta64_dtype(series)
        if timedelta_like:
            # Work on integer nanoseconds: the variance computation cannot
            # multiply timedeltas by each other.
            values = valid.values.astype('timedelta64[ns]').astype('int64')
        else:
            values = np.asarray(valid)

        if values.size == 0:
            # winsorize indexes into the sorted data and fails on empty
            # input, so report missing statistics instead.
            trim_mean = trim_std = pd.NaT if timedelta_like else np.nan
        else:
            trim_mean = trimmed_mean(values, limits=limits)
            trim_std = np.std(winsorize(values, limits=limits))
            if timedelta_like:
                trim_mean = pd.Timedelta(int(round(float(trim_mean))), unit='ns')
                trim_std = pd.Timedelta(int(round(float(trim_std))), unit='ns')

        d = ([series.count(), trim_mean, trim_std, series.min()] +
             [series.quantile(x) for x in percentiles] + [series.max()])
        return pd.Series(d, index=stat_index, name=series.name)

    def describe_categorical_1d(data):
        """Summarise one non-numeric column: count, unique, top, freq."""
        names = ['count', 'unique']
        objcounts = data.value_counts()
        # zero counts can appear for unused categories of a categorical
        count_unique = len(objcounts[objcounts != 0])
        result = [data.count(), count_unique]
        if count_unique > 0:
            top, freq = objcounts.index[0], objcounts.iloc[0]
            if is_datetime64_dtype(data):
                valid = data.dropna()
                names += ['top', 'freq', 'first', 'last']
                # min()/max() on the column itself is independent of the
                # datetime resolution, unlike a raw int64 view.
                result += [pd.Timestamp(top), freq,
                           pd.Timestamp(valid.min()),
                           pd.Timestamp(valid.max())]
            else:
                names += ['top', 'freq']
                result += [top, freq]
        return pd.Series(result, index=names, name=data.name)

    def describe_1d(data):
        """Dispatch a single column to the matching summary routine."""
        # bool must be tested first: it would otherwise count as numeric
        if is_bool_dtype(data):
            return describe_categorical_1d(data)
        if is_numeric_dtype(data) or is_timedelta64_dtype(data):
            return describe_numeric_1d_robust(data)
        return describe_categorical_1d(data)

    if self.ndim == 1:
        return describe_1d(self)
    elif (include is None) and (exclude is None):
        # when some numerics are found, keep only numerics
        data = self.select_dtypes(include=[np.number])
        if len(data.columns) == 0:
            data = self
    elif include == 'all':
        if exclude is not None:
            msg = "exclude must be None when include is 'all'"
            raise ValueError(msg)
        data = self
    else:
        data = self.select_dtypes(include=include, exclude=exclude)

    # ``items`` replaces ``iteritems``, which pandas 2.0 removed
    ldesc = [describe_1d(s) for _, s in data.items()]

    # set a convenient order for rows: shortest summaries first, then any
    # statistic names not seen yet, in order of appearance
    names = []
    for idxnames in sorted((x.index for x in ldesc), key=len):
        for name in idxnames:
            if name not in names:
                names.append(name)

    # ``join_axes`` was removed from ``pd.concat`` in pandas 1.0;
    # reindexing each summary to the common row order is the replacement.
    d = pd.concat([x.reindex(names) for x in ldesc], axis=1, sort=False)
    d.columns = data.columns.copy()
    return d
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment