Created
July 7, 2021 15:56
-
-
Save dutc/ed023acd490e08877f3c6d3dacd265df to your computer and use it in GitHub Desktop.
“Python Expert” Newsletter (July 7, 2021): Learning Corner
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from functools import total_ordering | |
from dataclasses import dataclass | |
from numpy import tile, repeat | |
from numpy.random import default_rng | |
from pandas import DataFrame, date_range, Timestamp | |
from pandas.core.dtypes.common import is_numeric_dtype | |
from random import seed | |
from string import ascii_lowercase | |
from numpy import number, int64 | |
from inspect import getfile | |
@dataclass
class Dummy:
    """Plain wrapper around one value.

    Only the dataclass defaults are present (``__init__``, ``__repr__``,
    ``__eq__``); no ordering or arithmetic methods are defined here.
    """

    # The wrapped payload; any object is accepted.
    value: object
@dataclass
@total_ordering
class Comparable:
    """Wrapper whose instances are ordered by their wrapped ``value``.

    ``__lt__`` is written by hand; ``total_ordering`` derives the remaining
    rich comparisons from it, and ``dataclass`` supplies ``__eq__``.
    """

    # The wrapped payload; it must itself support ``<`` for ordering to work.
    value: object

    def __lt__(self, other):
        """Return whether ``self.value`` sorts before ``other.value``.

        Returns ``NotImplemented`` when ``other`` is not a ``Comparable`` so
        Python raises ``TypeError`` for an unsupported comparison instead of
        leaking an ``AttributeError`` from the missing ``.value`` attribute.
        """
        if not isinstance(other, Comparable):
            return NotImplemented
        return self.value < other.value
@dataclass
class AlmostNumeric(int64):
    """``numpy.int64`` subclass that additionally stores a ``value`` field.

    The file asserts at module level that pandas' ``is_numeric_dtype``
    accepts this class. Addition and true division are defined on the
    stored ``value`` and re-wrap their result in ``AlmostNumeric``.
    """

    # The wrapped payload used by the arithmetic methods below.
    value: object

    def __add__(self, other):
        """Return ``AlmostNumeric`` wrapping ``self.value + other.value``.

        Returns ``NotImplemented`` for operands that are not
        ``AlmostNumeric`` rather than failing on the missing ``.value``.
        """
        if not isinstance(other, AlmostNumeric):
            return NotImplemented
        return AlmostNumeric(self.value + other.value)

    def __truediv__(self, other):
        """Return ``AlmostNumeric`` wrapping ``self.value / other.value``.

        Returns ``NotImplemented`` for operands that are not
        ``AlmostNumeric`` rather than failing on the missing ``.value``.
        """
        if not isinstance(other, AlmostNumeric):
            return NotImplemented
        return AlmostNumeric(self.value / other.value)
# Sanity check: pandas' `is_numeric_dtype` must accept the `int64` subclass.
assert is_numeric_dtype(AlmostNumeric)

if __name__ == '__main__':
    # One seed, derived from a fixed date, is shared by NumPy's generator and
    # the stdlib `random` module for reproducible output.
    rng = default_rng(s := Timestamp('2021-07-04').asm8.astype('uint32'))
    # `random.seed` only accepts None/int/float/str/bytes/bytearray on
    # Python 3.11+, so convert the NumPy scalar to a builtin `int` first.
    seed(int(s))

    # Draw a 5x4 grid of single letters, then view each row of four '<U1'
    # cells as one four-character '<U4' string: five random tickers.
    tickers = rng.choice([*ascii_lowercase], size=(5, 4)).view('<U4').ravel()
    dates = date_range('2021-07-04', periods=4)
    n_rows = len(tickers) * len(dates)  # one row per (date, ticker) pair

    df = DataFrame({
        'date': repeat(dates, len(tickers)),
        'ticker': tile(tickers, len(dates)),
        # per-ticker base price (floored at 10) repeated for every date, plus
        # a per-ticker random walk accumulated along the date axis
        'price': tile(
            rng.normal(loc=100, scale=50, size=len(tickers)).clip(10),
            len(dates)
        ) + rng.normal(scale=5, size=(len(dates), len(tickers))).cumsum(axis=0).ravel(),
        'volume': rng.integers(0, 1_000, size=n_rows),
        'signal': rng.normal(size=n_rows),
        'flag': rng.choice([True, False], size=n_rows),
    }).set_index(['date', 'ticker']).sort_index()

    print(
        df.groupby('ticker').max(),
        df.groupby('ticker').min(),

        # operates only on comparable columns
        df.pipe(lambda df: df.assign(signal=df['signal'].apply(Dummy))).groupby('ticker').max().columns,
        df.pipe(lambda df: df.assign(signal=df['signal'].apply(Comparable))).groupby('ticker').max().columns,

        # operates on numeric columns (`bool` is a numeric type)
        df.groupby('ticker').sum(),
        df.groupby('ticker').prod(),
        df.groupby('ticker').mean(),
        df.groupby('ticker').median(),
        df.groupby('ticker').std(),
        df.groupby('ticker').var(),
        df.groupby('ticker').skew(),

        # operates only on numeric columns
        df.pipe(lambda df: df.assign(signal=df['signal'].apply(Dummy))).groupby('ticker').mean().columns,
        df.pipe(lambda df: df.assign(signal=df['signal'].apply(AlmostNumeric))).groupby('ticker').mean().columns,

        # cumulative operations
        df.groupby('ticker').cumsum(),
        df.groupby('ticker').cumprod(),
        df.groupby('ticker').cummin(),
        df.groupby('ticker').cummax(),

        # counts
        df.groupby('ticker').count(),
        df.groupby('ticker').cumcount(),

        # positional picks within each group
        df.groupby('ticker').first(),
        df.groupby('ticker').last(),
        df.groupby('ticker').nth(1),
        df.groupby('ticker').nth(-2),

        df.groupby('ticker').rank(),

        # the `flag` column is dropped before computing percentage changes;
        # `difference` is given a list (array-like) rather than a set
        df.pipe(lambda x: x[x.columns.difference(['flag'])])
          .groupby('ticker').pct_change(),

        sep=f'\n{"-" * 78}\n',
    )
For the full write-up and discussion, sign up for the “Python Expert” newsletter!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
As you can see:
- The `.groupby` function has a number of predefined aggregation/reduction operations.
- These operate across the whole `DataFrame`, applying the aggregation only to relevant (e.g., comparable or numeric) columns.
- The result of a `.groupby` operation may be a `pandas.DataFrame` or `pandas.Series` (consistent with the input type, irrespective of the number of columns reduced over) with an `.index` set to the grouped value.