Skip to content

Instantly share code, notes, and snippets.

@dutc
Created July 7, 2021 15:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dutc/ed023acd490e08877f3c6d3dacd265df to your computer and use it in GitHub Desktop.
Save dutc/ed023acd490e08877f3c6d3dacd265df to your computer and use it in GitHub Desktop.
“Python Expert” Newsletter (July 7, 2021): Learning Corner
#!/usr/bin/env python3
from functools import total_ordering
from dataclasses import dataclass
from numpy import tile, repeat
from numpy.random import default_rng
from pandas import DataFrame, date_range, Timestamp
from pandas.core.dtypes.common import is_numeric_dtype
from random import seed
from string import ascii_lowercase
from numpy import number, int64
from inspect import getfile
@dataclass
class Dummy:
    """Opaque wrapper around a single value.

    Defines no ordering or arithmetic of its own, so instances can be stored
    in a column but cannot be compared with ``<`` or added together.
    """

    # The wrapped payload; any object is accepted.
    value: object
@dataclass
@total_ordering
class Comparable:
    """Wrapper around a single value that supports ordering comparisons.

    ``__lt__`` is written by hand; ``total_ordering`` derives ``<=``, ``>``
    and ``>=`` from it, and ``dataclass`` supplies ``__eq__``.
    """

    # The wrapped payload; ordering delegates to this value.
    value: object

    def __lt__(self, other):
        """Return whether ``self.value`` sorts before ``other.value``.

        Returns ``NotImplemented`` when ``other`` is not a ``Comparable`` so
        Python can try the reflected operation or raise ``TypeError``.
        """
        if not isinstance(other, Comparable):
            return NotImplemented
        return self.value < other.value
@dataclass
class AlmostNumeric(int64):
    """NumPy ``int64`` subclass that carries its payload in ``value``.

    ``+`` and ``/`` are overridden to operate on ``value`` and wrap the result
    in a new ``AlmostNumeric``. Both operators read ``other.value`` directly,
    so they only work when the other operand is also an ``AlmostNumeric``.
    """

    # The wrapped payload; the overridden operators use this attribute.
    value: object

    def __add__(self, other):
        """Return an ``AlmostNumeric`` wrapping ``self.value + other.value``."""
        return AlmostNumeric(self.value + other.value)

    def __truediv__(self, other):
        """Return an ``AlmostNumeric`` wrapping ``self.value / other.value``."""
        return AlmostNumeric(self.value / other.value)


# Sanity check: pandas classifies the class itself as a numeric dtype.
assert is_numeric_dtype(AlmostNumeric)
if __name__ == '__main__':
    # Seed NumPy's generator and the stdlib `random` module from the same
    # fixed, date-derived integer.
    rng = default_rng(s := Timestamp('2021-07-04').asm8.astype('uint32'))
    # `s` comes out of NumPy's `astype`, not a built-in `int`; `random.seed`
    # rejects unsupported seed types on Python 3.11+, so convert explicitly.
    seed(int(s))

    # Draw a 5x4 grid of random letters, then reinterpret each row of four
    # 1-character strings as one 4-character string: five ticker symbols.
    tickers = rng.choice([*ascii_lowercase], size=(5, 4)).view('<U4').ravel()
    dates = date_range('2021-07-04', periods=4)

    n_rows = len(tickers) * len(dates)
    df = DataFrame({
        'date': repeat(dates, len(tickers)),
        'ticker': tile(tickers, len(dates)),
        # Per-ticker base price (floored at 10) repeated for every date, plus
        # a per-ticker random walk accumulated along the date axis.
        'price': tile(
            rng.normal(loc=100, scale=50, size=len(tickers)).clip(10),
            len(dates)
        ) + rng.normal(scale=5, size=(len(dates), len(tickers))).cumsum(axis=0).ravel(),
        'volume': rng.integers(0, 1_000, size=n_rows),
        'signal': rng.normal(size=n_rows),
        'flag': rng.choice([True, False], size=n_rows),
    }).set_index(['date', 'ticker']).sort_index()

    # NOTE(review): the `.columns` lines below appear to rely on pandas
    # silently dropping columns a reduction cannot handle; pandas 2.0+ is
    # expected to raise instead -- confirm against the targeted version.
    print(
        df.groupby('ticker').max(),
        df.groupby('ticker').min(),

        # operates only on comparable columns
        df.pipe(lambda frame: frame.assign(signal=frame['signal'].apply(Dummy))).groupby('ticker').max().columns,
        df.pipe(lambda frame: frame.assign(signal=frame['signal'].apply(Comparable))).groupby('ticker').max().columns,

        # operates on numeric columns (`bool` is a numeric type)
        df.groupby('ticker').sum(),
        df.groupby('ticker').prod(),
        df.groupby('ticker').mean(),
        df.groupby('ticker').median(),
        df.groupby('ticker').std(),
        df.groupby('ticker').var(),
        df.groupby('ticker').skew(),

        # operates only on numeric columns
        df.pipe(lambda frame: frame.assign(signal=frame['signal'].apply(Dummy))).groupby('ticker').mean().columns,
        df.pipe(lambda frame: frame.assign(signal=frame['signal'].apply(AlmostNumeric))).groupby('ticker').mean().columns,

        df.groupby('ticker').cumsum(),
        df.groupby('ticker').cumprod(),
        df.groupby('ticker').cummin(),
        df.groupby('ticker').cummax(),

        df.groupby('ticker').count(),
        df.groupby('ticker').cumcount(),

        df.groupby('ticker').first(),
        df.groupby('ticker').last(),
        df.groupby('ticker').nth(1),
        df.groupby('ticker').nth(-2),

        df.groupby('ticker').rank(),

        # `flag` is excluded before computing percentage changes.
        df.pipe(lambda frame: frame[frame.columns.difference(['flag'])])
          .groupby('ticker').pct_change(),

        sep=f'\n{"-" * 78}\n',
    )
@dutc
Copy link
Author

dutc commented Jul 7, 2021

As you can see:

  • the .groupby function has a number of predefined aggregation/reduction operations.
  • some operations may return a subset of the DataFrame, applying the aggregation only to relevant (e.g., comparable or numeric) columns
  • the return value of a .groupby reduction may be a pandas.DataFrame or pandas.Series (consistent with the input type, irrespective of the number of columns reduced over) with an .index set to the grouped value; transformations such as .cumsum or .rank instead keep the index of the input

@dutc
Copy link
Author

dutc commented Jul 7, 2021

For the full write-up and discussion, sign up for the “Python Expert” newsletter!

bit.ly/expert-python

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment