Last active
November 29, 2020 17:55
-
-
Save MAX10541/ee7bbcbcc6def831b461ccdfe7154195 to your computer and use it in GitHub Desktop.
A "self-contained" version of pandas `describe` method. Used to help answer this SO question: https://stackoverflow.com/questions/51014376/pandas-dataframe-groupby-describe-8x-slower-than-computing-separatly
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Note: some of the comments were removed, and a few were added.
# original source code: https://github.com/pandas-dev/pandas/blob/61f67b63ee8c01dad3858c3a01c34839d131d041/pandas/core/generic.py#L9986
# and: https://github.com/pandas-dev/pandas/blob/78d149897201cb55b403f6f84fc541a3d7aa86ad/pandas/core/groupby/groupby.py#L477
import warnings
from contextlib import contextmanager
from datetime import datetime
from typing import TYPE_CHECKING, Any, Callable, Dict, FrozenSet, Hashable, List, Mapping, Optional, Sequence, Set, Tuple, Type, Union, cast

import numpy as np
import pandas as pd
from pandas import Timestamp
from pandas._typing import Label
from pandas.api.types import is_datetime64_any_dtype, is_timedelta64_dtype
from pandas.core.generic import FrameOrSeries, format_percentiles, is_bool_dtype, is_numeric_dtype, validate_percentile
# describe method for DataFrame
def _validate_describe_percentiles(percentiles):
    """Normalize the ``percentiles`` argument for ``describe_df``.

    Returns a sorted, duplicate-free ``np.ndarray`` of fractions in
    [0, 1] that always contains the median (0.5).

    Raises
    ------
    ValueError
        If any value falls outside [0, 1] or the list contains duplicates.
    """
    if percentiles is None:
        # pandas' historical default: the quartiles.
        return np.array([0.25, 0.5, 0.75])
    # Explicit conversion so we can append the median below.
    percentiles = list(percentiles)
    if any(not 0 <= p <= 1 for p in percentiles):
        raise ValueError("percentiles should all be in the interval [0, 1]")
    # The median is always reported.
    if 0.5 not in percentiles:
        percentiles.append(0.5)
    percentiles = np.asarray(percentiles)
    unique_pcts = np.unique(percentiles)  # also sorts
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")
    return unique_pcts


def _format_describe_percentiles(percentiles):
    """Render quantile fractions as percent labels, e.g. 0.25 -> ``'25%'``.

    Stand-in for the pandas-internal ``format_percentiles``; ``%g`` trims
    trailing zeros so 0.5 -> ``'50%'`` and 0.125 -> ``'12.5%'``.
    """
    return [f"{p * 100:g}%" for p in percentiles]


def describe_df(
    self,
    percentiles=None,
    include=None,
    exclude=None,
    datetime_is_numeric=False,
):
    """Self-contained port of ``DataFrame.describe`` / ``Series.describe``.

    Unlike the original gist, this version uses only public pandas/numpy
    API: the original imported ``format_percentiles``/``validate_percentile``
    from ``pandas.core.generic`` (internals that move between versions) and
    referenced ``warnings``/``Timestamp``/``is_datetime64_any_dtype``/
    ``is_timedelta64_dtype`` without ever importing them, so the datetime,
    timedelta, and categorical-datetime paths raised ``NameError``.

    Parameters
    ----------
    self : DataFrame or Series
        Object to summarize (named ``self`` to mirror the method it ports).
    percentiles : list-like of float in [0, 1], optional
        Percentiles to include; the median is always added.
        Defaults to ``[0.25, 0.5, 0.75]``.
    include, exclude : dtype selectors, optional
        Column dtype filters, as in ``DataFrame.describe``.
    datetime_is_numeric : bool, default False
        Treat datetime columns as numeric rather than categorical.

    Returns
    -------
    DataFrame or Series
        Summary statistics; a Series when ``self`` is one-dimensional.

    Raises
    ------
    ValueError
        For a column-less DataFrame, out-of-range or duplicate percentiles,
        or ``exclude`` combined with ``include='all'``.
    """
    if self.ndim == 2 and self.columns.size == 0:
        raise ValueError("Cannot describe a DataFrame without columns")

    percentiles = _validate_describe_percentiles(percentiles)
    formatted_percentiles = _format_describe_percentiles(percentiles)

    def describe_numeric_1d(series):
        # count/mean/std/min, the requested quantiles, then max.
        stat_index = (
            ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
        )
        d = (
            [series.count(), series.mean(), series.std(), series.min()]
            + series.quantile(percentiles).tolist()
            + [series.max()]
        )
        return pd.Series(d, index=stat_index, name=series.name)

    def describe_categorical_1d(data):
        names = ["count", "unique"]
        objcounts = data.value_counts()
        count_unique = len(objcounts[objcounts != 0])
        result = [data.count(), count_unique]
        dtype = None
        if result[1] > 0:
            top, freq = objcounts.index[0], objcounts.iloc[0]
            if pd.api.types.is_datetime64_any_dtype(data.dtype):
                # Legacy path: datetimes summarized as categoricals unless
                # the caller opted in via datetime_is_numeric=True.
                stacklevel = 4 if self.ndim == 1 else 5
                warnings.warn(
                    "Treating datetime data as categorical rather than numeric in "
                    "`.describe` is deprecated and will be removed in a future "
                    "version of pandas. Specify `datetime_is_numeric=True` to "
                    "silence this warning and adopt the future behavior now.",
                    FutureWarning,
                    stacklevel=stacklevel,
                )
                tz = data.dt.tz
                asint = data.dropna().values.view("i8")
                top = pd.Timestamp(top)
                if top.tzinfo is not None and tz is not None:
                    # Don't tz_localize(None) if key is already tz-aware
                    top = top.tz_convert(tz)
                else:
                    top = top.tz_localize(tz)
                names += ["top", "freq", "first", "last"]
                result += [
                    top,
                    freq,
                    pd.Timestamp(asint.min(), tz=tz),
                    pd.Timestamp(asint.max(), tz=tz),
                ]
            else:
                names += ["top", "freq"]
                result += [top, freq]
        else:
            # Empty column: keep the 'top'/'freq' slots (as NaN) so the
            # output shape stays consistent across columns.
            names += ["top", "freq"]
            result += [np.nan, np.nan]
            dtype = "object"
        return pd.Series(result, index=names, name=data.name, dtype=dtype)

    def describe_timestamp_1d(data):
        # GH-30164: numeric-style summary for datetimes (no std).
        stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
        d = (
            [data.count(), data.mean(), data.min()]
            + data.quantile(percentiles).tolist()
            + [data.max()]
        )
        return pd.Series(d, index=stat_index, name=data.name)

    def describe_1d(data):
        # Dispatch on dtype; bools are summarized as categoricals, and
        # timedeltas as numerics.
        if pd.api.types.is_bool_dtype(data.dtype):
            return describe_categorical_1d(data)
        elif pd.api.types.is_numeric_dtype(data):
            return describe_numeric_1d(data)
        elif pd.api.types.is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
            return describe_timestamp_1d(data)
        elif pd.api.types.is_timedelta64_dtype(data.dtype):
            return describe_numeric_1d(data)
        else:
            return describe_categorical_1d(data)

    if self.ndim == 1:
        return describe_1d(self)
    elif (include is None) and (exclude is None):
        # Default: keep only numeric columns (plus datetimes when they are
        # treated as numeric); fall back to all columns when none match.
        default_include = [np.number]
        if datetime_is_numeric:
            default_include.append("datetime")
        data = self.select_dtypes(include=default_include)
        if len(data.columns) == 0:
            data = self
    elif include == "all":
        if exclude is not None:
            msg = "exclude must be None when include is 'all'"
            raise ValueError(msg)
        data = self
    else:
        data = self.select_dtypes(include=include, exclude=exclude)

    ldesc = [describe_1d(s) for _, s in data.items()]
    # Union of every column's stat names, shortest index first, preserving
    # first-seen order, so mixed-dtype frames align on one stat index.
    names = []
    ldesc_indexes = sorted((x.index for x in ldesc), key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    d = pd.concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
    d.columns = data.columns.copy()
    return d
@contextmanager
def group_selection_context(groupby):
    """Temporarily enable group selection on *groupby*.

    Turns selection on via the private ``_set_group_selection`` hook,
    yields the groupby object, and guarantees ``_reset_group_selection``
    runs on exit — even when the managed body raises.
    """
    groupby._set_group_selection()
    try:
        yield groupby
    finally:
        # Always restore the previous selection state.
        groupby._reset_group_selection()
def describe_dfg(dfg, **kwargs):
    """``describe`` for a DataFrame GroupBy.

    Applies ``describe_df`` to every group inside a group-selection
    context, then reshapes the result the way pandas' groupby-describe
    does: transpose when grouping along columns, otherwise unstack the
    per-group stat index into columns.
    """
    with group_selection_context(dfg):
        per_group = dfg.apply(lambda grp: describe_df(grp, **kwargs))
        if dfg.axis == 1:
            return per_group.T
        return per_group.unstack()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment