# dkapitan/describe_robust.py

## describe_robust.py
def describe_robust(self, percentiles=None, include=None, exclude=None, trim=0.2):
    """
    Monkey-patch for pd.DataFrame.describe based on robust statistics.

    Reports a trimmed mean and a winsorized standard deviation in place of
    the ordinary mean and standard deviation. Uses ``scipy.stats.mstats``
    (``trimmed_mean``, ``winsorize``) and ``numpy.std`` (population standard
    deviation, ``ddof=0``).

    See e.g. http://www.uh.edu/~ttian/ES.pdf for methodical background.

    BSD 3-Clause License

    Copyright (c) 2018, Daniel Kapitan (daniel@kapitan.net)
    All rights reserved.

    Parameters
    ----------
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should
        fall between 0 and 1. The default is
        ``[.25, .5, .75]``, which returns the 25th, 50th, and
        75th percentiles. The median (0.5) is always added.
    include : 'all', list-like of dtypes or None (default), optional
        A white list of data types to include in the result. Ignored
        for ``Series``. Here are the options:

        - 'all' : All columns of the input will be included in the output.
        - A list-like of dtypes : Limits the results to the
          provided data types.
          To limit the result to numeric types submit
          ``numpy.number``. To limit it instead to object columns submit
          the ``object`` data type. Strings
          can also be used in the style of
          ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
          select pandas categorical columns, use ``'category'``
        - None (default) : The result will include all numeric columns
          (or all columns when there are no numeric ones).
    exclude : list-like of dtypes or None (default), optional
        A black list of data types to omit from the result. Ignored
        for ``Series``. Here are the options:

        - A list-like of dtypes : Excludes the provided data types
          from the result. To exclude numeric types submit
          ``numpy.number``. To exclude object columns submit the data
          type ``object``. Strings can also be used in the style of
          ``select_dtypes`` (e.g. ``df.describe(exclude=['O'])``). To
          exclude pandas categorical columns, use ``'category'``
        - None (default) : The result will exclude nothing.
    trim : number, default 0.2
        Fraction of observations to trim / winsorize on *each* side.

    Returns
    -------
    summary : Series/DataFrame of robust summary statistics
        Numeric data yields the rows ``count``, ``trim_mean``, ``trim_std``,
        ``min``, the requested percentiles and ``max``; other data yields
        ``count``, ``unique``, ``top``, ``freq`` (plus ``first``/``last``
        for datetimes).

    Raises
    ------
    NotImplementedError
        If ``self`` has three or more dimensions.
    ValueError
        If a DataFrame has no columns, a percentile lies outside [0, 1],
        percentiles contain duplicates, or ``exclude`` is given together
        with ``include='all'``.
    """
    # Imports are kept local so the function stays a self-contained patch
    # that can be assigned onto pd.DataFrame / pd.Series.
    import numpy as np
    import pandas as pd
    from pandas.api.types import (
        is_bool_dtype,
        is_datetime64_any_dtype,
        is_numeric_dtype,
        is_timedelta64_dtype,
    )
    from scipy.stats.mstats import trimmed_mean, winsorize

    try:
        from pandas.io.formats.format import format_percentiles
    except ImportError:
        # Private pandas helper; fall back to a plain formatter if it moves.
        def format_percentiles(pcts):
            return ['{:g}%'.format(100 * p) for p in pcts]

    if self.ndim >= 3:
        msg = "describe is not implemented on Panel objects."
        raise NotImplementedError(msg)
    if self.ndim == 2 and self.columns.size == 0:
        raise ValueError("Cannot describe a DataFrame without columns")

    if percentiles is not None:
        # explicit conversion of `percentiles` to list
        percentiles = list(percentiles)

        # Validate inline instead of calling the private (and since removed)
        # NDFrame._check_percentile. The combined test also rejects NaN.
        as_float = np.asarray(percentiles, dtype=float)
        if not ((as_float >= 0) & (as_float <= 1)).all():
            raise ValueError(
                "percentiles should all be in the interval [0, 1]")

        # median should always be included
        if 0.5 not in percentiles:
            percentiles.append(0.5)
        percentiles = np.asarray(percentiles)
    else:
        percentiles = np.array([0.25, 0.5, 0.75])

    # sort and check for duplicates
    unique_pcts = np.unique(percentiles)
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")
    percentiles = unique_pcts
    formatted_percentiles = list(format_percentiles(percentiles))

    # scipy only expands a bare *float* into a two-sided limit; an explicit
    # tuple also works for integer input such as trim=0.
    limits = (trim, trim)

    def describe_numeric_1d_robust(series):
        stat_index = (['count', 'trim_mean', 'trim_std', 'min'] +
                      formatted_percentiles + ['max'])
        observed = series.dropna().values
        if len(observed):
            center = trimmed_mean(observed, limits=limits)
            spread = np.std(winsorize(observed, limits=limits))
        else:
            # winsorize indexes into the data and fails on empty input
            center = spread = np.nan
        d = ([series.count(), center, spread, series.min()] +
             [series.quantile(x) for x in percentiles] + [series.max()])
        return pd.Series(d, index=stat_index, name=series.name)

    def describe_categorical_1d(data):
        names = ['count', 'unique']
        objcounts = data.value_counts()
        count_unique = len(objcounts[objcounts != 0])
        result = [data.count(), count_unique]
        if count_unique > 0:
            top, freq = objcounts.index[0], objcounts.iloc[0]

            if is_datetime64_any_dtype(data):
                names += ['top', 'freq', 'first', 'last']
                # min()/max() skip NaT by default
                result += [pd.Timestamp(top), freq, data.min(), data.max()]
            else:
                names += ['top', 'freq']
                result += [top, freq]

        return pd.Series(result, index=names, name=data.name)

    def describe_1d(data):
        # bool counts as numeric for pandas, so it must be tested first
        if is_bool_dtype(data):
            return describe_categorical_1d(data)
        if is_numeric_dtype(data) or is_timedelta64_dtype(data):
            return describe_numeric_1d_robust(data)
        return describe_categorical_1d(data)

    if self.ndim == 1:
        return describe_1d(self)

    if include is None and exclude is None:
        # when some numerics are found, keep only numerics
        data = self.select_dtypes(include=[np.number])
        if len(data.columns) == 0:
            data = self
    elif isinstance(include, str) and include == 'all':
        if exclude is not None:
            msg = "exclude must be None when include is 'all'"
            raise ValueError(msg)
        data = self
    else:
        data = self.select_dtypes(include=include, exclude=exclude)

    ldesc = [describe_1d(column) for _, column in data.items()]

    # set a convenient order for rows: labels of the shortest summaries first
    names = []
    for idxnames in sorted((x.index for x in ldesc), key=len):
        for name in idxnames:
            if name not in names:
                names.append(name)

    # Align every summary on the common row labels before concatenating
    # (replacement for the removed ``join_axes`` argument of pd.concat).
    d = pd.concat([x.reindex(names) for x in ldesc], axis=1)
    d.columns = data.columns.copy()
    return d