Skip to content

Instantly share code, notes, and snippets.

@ayushkumarshah
Last active June 1, 2020 14:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ayushkumarshah/aa35d7fbfb9474d2a615665766d20a35 to your computer and use it in GitHub Desktop.
Save ayushkumarshah/aa35d7fbfb9474d2a615665766d20a35 to your computer and use it in GitHub Desktop.
Pandas pipeline
import datetime as dt
import functools

import numpy as mp
import pandas as pd
def df_info(f):
    """Decorate a pipeline step so it reports timing and frame metadata.

    The wrapper calls ``f(df, *args, **kwargs)``, times the call with
    ``datetime.now()``, prints the step name, the elapsed time and the
    ``shape``, ``columns`` and ``index`` of the returned object, and then
    returns that object unchanged.

    Args:
        f: Callable taking a DataFrame as its first argument and returning
            an object that exposes ``shape``, ``columns`` and ``index``.

    Returns:
        The wrapping callable, carrying ``f``'s name and docstring.
    """
    @functools.wraps(f)  # keep f.__name__ / f.__doc__ on the decorated step
    def wrapper(df, *args, **kwargs):
        tic = dt.datetime.now()
        result = f(df, *args, **kwargs)
        toc = dt.datetime.now()
        print("\n\n{} took {} time\n".format(f.__name__, toc - tic))
        print("After applying {}\n".format(f.__name__))
        print("Shape of df = {}\n".format(result.shape))
        print("Columns of df are {}\n".format(result.columns))
        print("Index of df is {}\n".format(result.index))
        # Separator rule of 100 dashes, deliberately without a trailing
        # newline (the next step's report starts with "\n\n").
        print("-" * 100, end='')
        return result
    return wrapper
def start_pipeline(df):
    """Return a copy of ``df`` so later in-place steps leave the input alone."""
    working_copy = df.copy()
    return working_copy
@df_info
def create_dateindex(df):
    """Replace the index of ``df`` with datetimes parsed as ``%Y%m%d``.

    The index is reassigned on ``df`` itself and the same object is returned.
    """
    parsed_index = pd.to_datetime(df.index, format="%Y%m%d")
    df.index = parsed_index
    return df
@df_info
def remove_columns(df):
    """Drop unwanted columns from ``df`` in place and return it.

    Removes the columns at positions 4-9 and 11-14 together with the
    columns named 'posNeg' and 'fips'.
    """
    by_position = list(df.columns[4:10]) + list(df.columns[11:15])
    unwanted = by_position + ['posNeg', 'fips']
    df.drop(columns=unwanted, inplace=True)
    return df
@df_info
def fill_missing(df):
    """Replace every missing value in ``df`` with 0, in place, and return it."""
    df.fillna(0, inplace=True)
    return df
@df_info
def add_state_name(df):
    """Attach a ``state_name`` column looked up from the state info file.

    Reads the ``state`` and ``name`` columns of ``data/state_info.csv``,
    left-joins them onto ``df`` by ``state``, restores ``date`` as the index
    and renames ``name`` to ``state_name``.

    Args:
        df: Frame with a ``state`` column. Its index must turn into a column
            called ``date`` on ``reset_index()`` (presumably the index is
            named ``date`` -- confirm against the loader).

    Returns:
        A new DataFrame indexed by ``date`` with the extra ``state_name``
        column; the left join keeps every row of ``df``.
    """
    state_names = pd.read_csv('data/state_info.csv', usecols=['state', 'name'])
    # ``on`` and ``left_index=True`` are two competing ways to name the left
    # join key; recent pandas versions reject the combination with a
    # MergeError. The join is on the ``state`` column only.
    merged = df.reset_index().merge(state_names, on='state', how='left')
    merged.set_index('date', inplace=True)
    merged.rename(columns={'name': 'state_name'}, inplace=True)
    return merged
@df_info
def drop_state(df):
    """Remove the 'state' column from ``df`` in place and return it."""
    df.drop(['state'], axis=1, inplace=True)
    return df
@df_info
def sample_daily(df):
    """Resample ``df`` into daily ('D') bins and sum each bin."""
    daily_bins = df.resample('D')
    return daily_bins.sum()
@df_info
def add_active_cases(df):
    """Add an 'active' column: positive minus death minus recovered.

    The column is written onto ``df`` itself, which is then returned.
    """
    after_deaths = df['positive'] - df['death']
    df['active'] = after_deaths - df['recovered']
    return df
def aggregate_monthly(df, month):
    """Aggregate the rows selected by ``df.loc[month]`` per ``state_name``.

    The first group of columns keeps the first value found in each state's
    rows; the ``*Increase`` columns are summed. Output columns follow the
    order listed below.
    """
    first_columns = ('positive', 'negative', 'pending', 'recovered', 'death',
                     'hospitalized', 'total', 'totalTestResults')
    summed_columns = ('deathIncrease', 'hospitalizedIncrease',
                      'negativeIncrease', 'positiveIncrease',
                      'totalTestResultsIncrease')
    # Dict insertion order fixes the output column order: firsts, then sums.
    how = dict.fromkeys(first_columns, 'first')
    how.update(dict.fromkeys(summed_columns, 'sum'))
    month_rows = df.loc[month]
    return month_rows.groupby('state_name').agg(how)
@df_info
def create_month_only(df, month):
    """Return per-state aggregates for ``month`` minus the previous month's.

    Args:
        df: Frame accepted by ``aggregate_monthly``.
        month: Month label whose first four characters are the year and
            whose last two are the month number, e.g. ``'2020-04'``.

    Returns:
        ``aggregate_monthly(df, month)`` with
        ``aggregate_monthly(df, <previous month>)`` subtracted via
        ``DataFrame.sub``.
    """
    df_current = aggregate_monthly(df, month)
    month_number = int(month[-2:])
    if month_number == 1:
        # January is month 1 (not 0): roll back to December of the prior
        # year instead of producing the invalid label 'YYYY-00'.
        prev_month = str(int(month[:4]) - 1) + '-12'
    else:
        prev_month = month[:5] + '{:02d}'.format(month_number - 1)
    df_previous = aggregate_monthly(df, prev_month)
    return df_current.sub(df_previous)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment