Last active
November 29, 2020 17:55
-
-
Save MAX10541/ee7bbcbcc6def831b461ccdfe7154195 to your computer and use it in GitHub Desktop.
A "self-contained" version of pandas `describe` method. Used to help answer this SO question: https://stackoverflow.com/questions/51014376/pandas-dataframe-groupby-describe-8x-slower-than-computing-separatly
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Note: some of the comments were removed, and a few were added.
# original source code: https://github.com/pandas-dev/pandas/blob/61f67b63ee8c01dad3858c3a01c34839d131d041/pandas/core/generic.py#L9986
# and: https://github.com/pandas-dev/pandas/blob/78d149897201cb55b403f6f84fc541a3d7aa86ad/pandas/core/groupby/groupby.py#L477
import warnings
from contextlib import contextmanager
from datetime import datetime
from typing import TYPE_CHECKING, Any, Callable, Dict, FrozenSet, Hashable, List, Mapping, Optional, Sequence, Set, Tuple, Type, Union, cast

import numpy as np
import pandas as pd
from pandas import Timestamp
from pandas._typing import Label
from pandas.api.types import is_datetime64_any_dtype, is_timedelta64_dtype
from pandas.core.generic import FrameOrSeries, format_percentiles, is_bool_dtype, is_numeric_dtype, validate_percentile
# describe method for DataFrame
def _validate_describe_percentiles(percentiles):
    """Normalize the ``percentiles`` argument for ``describe_df``.

    Returns a sorted, duplicate-free ``np.ndarray`` of fractions in
    [0, 1] that always contains the median (0.5).

    Raises
    ------
    ValueError
        If any value falls outside [0, 1] or the list contains duplicates.
    """
    if percentiles is None:
        # pandas' historical default: the quartiles.
        return np.array([0.25, 0.5, 0.75])
    # Explicit conversion so we can append the median below.
    percentiles = list(percentiles)
    if any(not 0 <= p <= 1 for p in percentiles):
        raise ValueError("percentiles should all be in the interval [0, 1]")
    # The median is always reported.
    if 0.5 not in percentiles:
        percentiles.append(0.5)
    percentiles = np.asarray(percentiles)
    unique_pcts = np.unique(percentiles)  # also sorts
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")
    return unique_pcts


def _format_describe_percentiles(percentiles):
    """Render quantile fractions as percent labels, e.g. 0.25 -> ``'25%'``.

    Stand-in for the pandas-internal ``format_percentiles``; ``%g`` trims
    trailing zeros so 0.5 -> ``'50%'`` and 0.125 -> ``'12.5%'``.
    """
    return [f"{p * 100:g}%" for p in percentiles]


def describe_df(
    self,
    percentiles=None,
    include=None,
    exclude=None,
    datetime_is_numeric=False,
):
    """Self-contained port of ``DataFrame.describe`` / ``Series.describe``.

    Unlike the original gist, this version uses only public pandas/numpy
    API: the original imported ``format_percentiles``/``validate_percentile``
    from ``pandas.core.generic`` (internals that move between versions) and
    referenced ``warnings``/``Timestamp``/``is_datetime64_any_dtype``/
    ``is_timedelta64_dtype`` without ever importing them, so the datetime,
    timedelta, and categorical-datetime paths raised ``NameError``.

    Parameters
    ----------
    self : DataFrame or Series
        Object to summarize (named ``self`` to mirror the method it ports).
    percentiles : list-like of float in [0, 1], optional
        Percentiles to include; the median is always added.
        Defaults to ``[0.25, 0.5, 0.75]``.
    include, exclude : dtype selectors, optional
        Column dtype filters, as in ``DataFrame.describe``.
    datetime_is_numeric : bool, default False
        Treat datetime columns as numeric rather than categorical.

    Returns
    -------
    DataFrame or Series
        Summary statistics; a Series when ``self`` is one-dimensional.

    Raises
    ------
    ValueError
        For a column-less DataFrame, out-of-range or duplicate percentiles,
        or ``exclude`` combined with ``include='all'``.
    """
    if self.ndim == 2 and self.columns.size == 0:
        raise ValueError("Cannot describe a DataFrame without columns")

    percentiles = _validate_describe_percentiles(percentiles)
    formatted_percentiles = _format_describe_percentiles(percentiles)

    def describe_numeric_1d(series):
        # count/mean/std/min, the requested quantiles, then max.
        stat_index = (
            ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
        )
        d = (
            [series.count(), series.mean(), series.std(), series.min()]
            + series.quantile(percentiles).tolist()
            + [series.max()]
        )
        return pd.Series(d, index=stat_index, name=series.name)

    def describe_categorical_1d(data):
        names = ["count", "unique"]
        objcounts = data.value_counts()
        count_unique = len(objcounts[objcounts != 0])
        result = [data.count(), count_unique]
        dtype = None
        if result[1] > 0:
            top, freq = objcounts.index[0], objcounts.iloc[0]
            if pd.api.types.is_datetime64_any_dtype(data.dtype):
                # Legacy path: datetimes summarized as categoricals unless
                # the caller opted in via datetime_is_numeric=True.
                stacklevel = 4 if self.ndim == 1 else 5
                warnings.warn(
                    "Treating datetime data as categorical rather than numeric in "
                    "`.describe` is deprecated and will be removed in a future "
                    "version of pandas. Specify `datetime_is_numeric=True` to "
                    "silence this warning and adopt the future behavior now.",
                    FutureWarning,
                    stacklevel=stacklevel,
                )
                tz = data.dt.tz
                asint = data.dropna().values.view("i8")
                top = pd.Timestamp(top)
                if top.tzinfo is not None and tz is not None:
                    # Don't tz_localize(None) if key is already tz-aware
                    top = top.tz_convert(tz)
                else:
                    top = top.tz_localize(tz)
                names += ["top", "freq", "first", "last"]
                result += [
                    top,
                    freq,
                    pd.Timestamp(asint.min(), tz=tz),
                    pd.Timestamp(asint.max(), tz=tz),
                ]
            else:
                names += ["top", "freq"]
                result += [top, freq]
        else:
            # Empty column: keep the 'top'/'freq' slots (as NaN) so the
            # output shape stays consistent across columns.
            names += ["top", "freq"]
            result += [np.nan, np.nan]
            dtype = "object"
        return pd.Series(result, index=names, name=data.name, dtype=dtype)

    def describe_timestamp_1d(data):
        # GH-30164: numeric-style summary for datetimes (no std).
        stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
        d = (
            [data.count(), data.mean(), data.min()]
            + data.quantile(percentiles).tolist()
            + [data.max()]
        )
        return pd.Series(d, index=stat_index, name=data.name)

    def describe_1d(data):
        # Dispatch on dtype; bools are summarized as categoricals, and
        # timedeltas as numerics.
        if pd.api.types.is_bool_dtype(data.dtype):
            return describe_categorical_1d(data)
        elif pd.api.types.is_numeric_dtype(data):
            return describe_numeric_1d(data)
        elif pd.api.types.is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
            return describe_timestamp_1d(data)
        elif pd.api.types.is_timedelta64_dtype(data.dtype):
            return describe_numeric_1d(data)
        else:
            return describe_categorical_1d(data)

    if self.ndim == 1:
        return describe_1d(self)
    elif (include is None) and (exclude is None):
        # Default: keep only numeric columns (plus datetimes when they are
        # treated as numeric); fall back to all columns when none match.
        default_include = [np.number]
        if datetime_is_numeric:
            default_include.append("datetime")
        data = self.select_dtypes(include=default_include)
        if len(data.columns) == 0:
            data = self
    elif include == "all":
        if exclude is not None:
            msg = "exclude must be None when include is 'all'"
            raise ValueError(msg)
        data = self
    else:
        data = self.select_dtypes(include=include, exclude=exclude)

    ldesc = [describe_1d(s) for _, s in data.items()]
    # Union of every column's stat names, shortest index first, preserving
    # first-seen order, so mixed-dtype frames align on one stat index.
    names = []
    ldesc_indexes = sorted((x.index for x in ldesc), key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    d = pd.concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
    d.columns = data.columns.copy()
    return d
@contextmanager
def group_selection_context(groupby):
    """Temporarily enable group selection on *groupby*.

    Turns selection on via the private ``_set_group_selection`` hook,
    yields the groupby object, and guarantees ``_reset_group_selection``
    runs on exit — even when the managed body raises.
    """
    groupby._set_group_selection()
    try:
        yield groupby
    finally:
        # Always restore the previous selection state.
        groupby._reset_group_selection()
def describe_dfg(dfg, **kwargs):
    """``describe`` for a DataFrame GroupBy.

    Applies ``describe_df`` to every group inside a group-selection
    context, then reshapes the result the way pandas' groupby-describe
    does: transpose when grouping along columns, otherwise unstack the
    per-group stat index into columns.
    """
    with group_selection_context(dfg):
        per_group = dfg.apply(lambda grp: describe_df(grp, **kwargs))
        if dfg.axis == 1:
            return per_group.T
        return per_group.unstack()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment