Skip to content

Instantly share code, notes, and snippets.

@BigAN
Last active October 10, 2018 01:19
Show Gist options
  • Save BigAN/e6942f3059710d19b1cf0b3534df1fb0 to your computer and use it in GitHub Desktop.
Save BigAN/e6942f3059710d19b1cf0b3534df1fb0 to your computer and use it in GitHub Desktop.
EddyFlexible Lead/Lag Feature Generation
"""
This script provides reusable code for generating lead/lag time
delta features (using epoch time) for an arbitrary choice of lead/lag orders.
You can use this to generate useful visit time delta features for
this competition, and it should be fairly straightforward to
apply the functions to other datasets as well. Feel free to just
take the output from this kernel as features, they'll match the original
order of train and test. I hope it's helpful!
@author: Joseph Eddy
"""
import numpy as np
import pandas as pd
def add_orig_ind_cols(dfs):
    '''
    Stamp every frame in dfs, in place, with an 'orig_ind' column holding
    that frame's current index values, so its row order can be recovered
    later by sorting on that column. Returns nothing.
    '''
    for frame in dfs:
        # Overwrites any existing 'orig_ind' column on the frame.
        frame['orig_ind'] = frame.index.values
def restore_orig_orders(dfs):
    '''
    Put every frame in dfs back into the row order recorded in its
    'orig_ind' column, then remove that helper column. Both steps modify
    the frames in place; nothing is returned.
    '''
    for frame in dfs:
        frame.sort_values(by='orig_ind', inplace=True)
        # The tracker column has served its purpose once the rows are sorted.
        del frame['orig_ind']
def add_grouped_time_delta_features(df, time_col, group_cols, shifts):
    '''
    For epoch time, compute deltas with the specified shifts on sequences
    aggregated by group_cols, return df with new columns.

    Parameters:
        df: input DataFrame; it is not modified, a time-sorted copy is
            built and returned instead.
        time_col: name of the numeric (epoch) time column.
        group_cols: list of column names defining the sequences; a single
            column name given as a string is also accepted.
        shifts: iterable of integer shift orders. Positive values compare
            each row with an earlier row of its group (lag), negative
            values with a later row (lead).

    Returns:
        A copy of df sorted by time_col with one extra signed-integer
        column per shift, named '<group cols joined by _>_delta_shift_<n>'.
        Each value is the non-negative time gap to the shifted row, or -1
        when the group has no row at that offset.
    '''
    # A bare string would otherwise be joined character by character when
    # building the feature name.
    if isinstance(group_cols, str):
        group_cols = [group_cols]

    # sort by time so that shifting inside a group follows time order
    df = df.sort_values(by=time_col)
    for shift in shifts:
        feat_name = '_'.join(group_cols) + ('_delta_shift_%d' % shift)
        # Stay in float64 here: float32 cannot represent every whole number
        # above 2**24, which silently rounded large gaps.
        delta = df.groupby(group_cols)[time_col].shift(shift) - df[time_col]
        delta = delta * -1 * np.sign(shift)  # flip sign for lags
        # A signed dtype is required to keep the -1 "no neighbour" marker;
        # casting a negative float to an unsigned integer is not well defined.
        df[feat_name] = delta.fillna(-1).astype(np.int64)
    return df
# Load only the two columns the features need; the visitor id is read as a
# string rather than being parsed as a number.
read_cols = ['fullVisitorId', 'visitStartTime']
read_types = {'fullVisitorId': 'str'}
X_train = pd.read_csv('../input/train.csv', usecols=read_cols, dtype=read_types)
X_test = pd.read_csv('../input/test.csv', usecols=read_cols, dtype=read_types)

# Record the incoming row order now: feature generation returns the frames
# sorted by time, and the order is restored before saving.
add_orig_ind_cols([X_train, X_test])

### Visitor time delta features: lags and leads to order 3
###
print('Extracting time delta features features...\n')
# Shift orders -3..-1 (leads) and 1..3 (lags); 0 is skipped.
lags = list(range(-3, 0)) + list(range(1, 4))
X_train = add_grouped_time_delta_features(
    X_train, 'visitStartTime', ['fullVisitorId'], lags)
X_test = add_grouped_time_delta_features(
    X_test, 'visitStartTime', ['fullVisitorId'], lags)

# Put rows back in their original order, then write out everything except
# the raw time column.
restore_orig_orders([X_train, X_test])
print('Saving feature output...')
X_train.drop('visitStartTime', axis=1).to_csv('train_delta_feats.csv', index=False)
X_test.drop('visitStartTime', axis=1).to_csv('test_delta_feats.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment