Skip to content

Instantly share code, notes, and snippets.

@MAX10541
Last active November 29, 2020 17:55
Show Gist options
  • Save MAX10541/ee7bbcbcc6def831b461ccdfe7154195 to your computer and use it in GitHub Desktop.
Save MAX10541/ee7bbcbcc6def831b461ccdfe7154195 to your computer and use it in GitHub Desktop.
A "self-contained" version of pandas `describe` method. Used to help answer this SO question: https://stackoverflow.com/questions/51014376/pandas-dataframe-groupby-describe-8x-slower-than-computing-separatly
# Note: some of the comments were removed, and a few were added.
# original source code: https://github.com/pandas-dev/pandas/blob/61f67b63ee8c01dad3858c3a01c34839d131d041/pandas/core/generic.py#L9986
# and: https://github.com/pandas-dev/pandas/blob/78d149897201cb55b403f6f84fc541a3d7aa86ad/pandas/core/groupby/groupby.py#L477
import warnings
from contextlib import contextmanager
from datetime import datetime
from typing import TYPE_CHECKING, Any, Callable, Dict, FrozenSet, Hashable, List, Mapping, Optional, Sequence, Set, Tuple, Type, Union, cast

import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype, is_timedelta64_dtype
# NOTE(review): the names below are private re-exports of pandas.core.generic and
# are presumably only importable on the pandas version this gist targeted -- confirm.
from pandas.core.generic import FrameOrSeries, format_percentiles, is_bool_dtype, is_numeric_dtype, validate_percentile
# describe method for DataFrame
def describe_df(
    self: "FrameOrSeries",
    percentiles=None,
    include=None,
    exclude=None,
    datetime_is_numeric=False,
) -> "FrameOrSeries":
    """Compute descriptive statistics for a Series or the columns of a DataFrame.

    Stand-alone copy of pandas' ``describe`` method; ``self`` is the object
    being summarised.

    Parameters
    ----------
    self : Series or DataFrame
        Object to describe.
    percentiles : iterable of float, optional
        Percentiles to report; they are checked with ``validate_percentile``.
        ``0.5`` is appended when missing. Defaults to ``[0.25, 0.5, 0.75]``.
    include, exclude : optional
        Only used for 2-D input. When both are ``None`` the numeric columns
        (plus datetime columns if ``datetime_is_numeric``) are described,
        falling back to every column when that selection is empty.
        ``include="all"`` describes every column and requires
        ``exclude is None``. Any other combination is forwarded to
        ``select_dtypes``.
    datetime_is_numeric : bool, default False
        When true, datetime data gets count/mean/min/percentiles/max instead
        of the categorical summary (which emits a ``FutureWarning``).

    Returns
    -------
    Series or DataFrame
        A Series for 1-D input; otherwise a DataFrame with one column per
        described column and the union of the per-column statistic labels
        as its index.

    Raises
    ------
    ValueError
        If a DataFrame has no columns, if ``percentiles`` contains
        duplicates, or if ``exclude`` is given together with
        ``include="all"``.
    """
    if self.ndim == 2 and self.columns.size == 0:
        raise ValueError("Cannot describe a DataFrame without columns")

    if percentiles is not None:
        # explicit conversion of `percentiles` to list
        percentiles = list(percentiles)

        # get them all to be in [0, 1]
        validate_percentile(percentiles)

        # median should always be included
        if 0.5 not in percentiles:
            percentiles.append(0.5)
        percentiles = np.asarray(percentiles)
    else:
        percentiles = np.array([0.25, 0.5, 0.75])

    # sort and check for duplicates
    unique_pcts = np.unique(percentiles)
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")
    percentiles = unique_pcts

    formatted_percentiles = format_percentiles(percentiles)

    def describe_numeric_1d(series) -> "Series":
        """Return count/mean/std/min/percentiles/max for one column."""
        stat_index = (
            ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
        )
        d = (
            [series.count(), series.mean(), series.std(), series.min()]
            + series.quantile(percentiles).tolist()
            + [series.max()]
        )
        return pd.Series(d, index=stat_index, name=series.name)

    def describe_categorical_1d(data) -> "Series":
        """Return count/unique/top/freq (plus first/last for datetimes)."""
        names = ["count", "unique"]
        objcounts = data.value_counts()
        count_unique = len(objcounts[objcounts != 0])
        result = [data.count(), count_unique]
        dtype = None

        if result[1] > 0:
            top, freq = objcounts.index[0], objcounts.iloc[0]

            if is_datetime64_any_dtype(data.dtype):
                # One extra frame when reached through the DataFrame path.
                if self.ndim == 1:
                    stacklevel = 4
                else:
                    stacklevel = 5
                warnings.warn(
                    "Treating datetime data as categorical rather than numeric in "
                    "`.describe` is deprecated and will be removed in a future "
                    "version of pandas. Specify `datetime_is_numeric=True` to "
                    "silence this warning and adopt the future behavior now.",
                    FutureWarning,
                    stacklevel=stacklevel,
                )
                tz = data.dt.tz
                top = pd.Timestamp(top)
                if top.tzinfo is not None and tz is not None:
                    # Don't tz_localize(None) if key is already tz-aware
                    top = top.tz_convert(tz)
                else:
                    top = top.tz_localize(tz)
                names += ["top", "freq", "first", "last"]
                # min()/max() skip missing values and keep the column's own
                # timezone and resolution; rebuilding Timestamps from a raw
                # int64 view would read the integers as nanoseconds.
                result += [top, freq, data.min(), data.max()]
            else:
                names += ["top", "freq"]
                result += [top, freq]
        else:
            # No non-null values: fill 'top' and 'freq' with NaN so the
            # output keeps the same shape.
            names += ["top", "freq"]
            result += [np.nan, np.nan]
            dtype = "object"

        return pd.Series(result, index=names, name=data.name, dtype=dtype)

    def describe_timestamp_1d(data) -> "Series":
        """Return count/mean/min/percentiles/max for datetime data."""
        # GH-30164
        stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
        d = (
            [data.count(), data.mean(), data.min()]
            + data.quantile(percentiles).tolist()
            + [data.max()]
        )
        return pd.Series(d, index=stat_index, name=data.name)

    def describe_1d(data) -> "Series":
        """Dispatch one column to the summary matching its dtype."""
        # Booleans are tested first so they are summarised as categories.
        if is_bool_dtype(data.dtype):
            return describe_categorical_1d(data)
        elif is_numeric_dtype(data):
            return describe_numeric_1d(data)
        elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
            return describe_timestamp_1d(data)
        elif is_timedelta64_dtype(data.dtype):
            return describe_numeric_1d(data)
        else:
            return describe_categorical_1d(data)

    if self.ndim == 1:
        # Incompatible return value type
        # (got "Series", expected "FrameOrSeries") [return-value]
        return describe_1d(self)  # type:ignore[return-value]
    elif (include is None) and (exclude is None):
        # when some numerics are found, keep only numerics
        default_include = [np.number]
        if datetime_is_numeric:
            default_include.append("datetime")
        data = self.select_dtypes(include=default_include)
        if len(data.columns) == 0:
            data = self
    elif include == "all":
        if exclude is not None:
            msg = "exclude must be None when include is 'all'"
            raise ValueError(msg)
        data = self
    else:
        data = self.select_dtypes(include=include, exclude=exclude)

    ldesc = [describe_1d(s) for _, s in data.items()]

    # Build the ordered union of statistic labels, visiting the shortest
    # per-column index first.
    names: List[Hashable] = []
    ldesc_indexes = sorted((x.index for x in ldesc), key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)

    d = pd.concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
    d.columns = data.columns.copy()
    return d
@contextmanager
def group_selection_context(groupby):
    """Yield ``groupby`` with its group selection set for the ``with`` body.

    ``groupby._set_group_selection()`` is called on entry and
    ``groupby._reset_group_selection()`` on exit, including when the body
    raises.
    """
    # Set outside the ``try`` so a failed set is not followed by a reset.
    groupby._set_group_selection()
    try:
        yield groupby
    finally:
        groupby._reset_group_selection()
def describe_dfg(dfg, **kwargs):  # describe method for GroupByDataFrame
    """Describe every group of ``dfg`` and combine the per-group summaries.

    ``kwargs`` are forwarded unchanged to ``describe_df`` for each group.
    The applied result is transposed when ``dfg.axis == 1`` and unstacked
    otherwise.
    """

    def summarise(group):
        return describe_df(group, **kwargs)

    with group_selection_context(dfg):
        per_group = dfg.apply(summarise)
        return per_group.T if dfg.axis == 1 else per_group.unstack()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment