Last active
July 13, 2016 19:21
-
-
Save alejio/f3d39ed967cf5aa554e1d9dae8d2ffd1 to your computer and use it in GitHub Desktop.
Python, Pandas: Helpful function for creating dummy vars on a df using pd.get_dummies()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def df_get_dummies(df, drops, cats, pkey=None, binary=1):
    """Dummy-encode a DataFrame and optionally aggregate it per key.

    Steps performed:
      1. Drop the columns listed in ``drops``.
      2. Cast the ``cats`` columns to integer and then to ``category`` so
         that ``pd.get_dummies`` expands them into indicator columns.
      3. Set the key column aside, dummy-encode everything else, cast the
         encoded columns to ``int8`` and re-attach the key column.
      4. Aggregate per key according to ``binary``.

    Args:
        df: Input DataFrame. It is not modified; a new frame is returned.
        drops: Column label or list of labels to remove before encoding.
        cats: Column label or list of labels whose values are integer-like
            codes that must be treated as categorical.
        pkey: Label of the grouping key column. Defaults to the first
            column that remains after ``drops`` has been applied.
        binary: Aggregation mode.
            ``1`` - per key, the maximum of every column (whether the
            predictor ever occurred).
            ``0`` - per key, the sum of every column (how many times the
            predictor occurred).
            ``2`` - no aggregation.
            Any other value also leaves the rows unaggregated.

    Returns:
        The encoded DataFrame. For ``binary`` 0 or 1 the key is the first
        column and there is one row per key; otherwise the key is the last
        column and the rows are unaggregated.

    Note:
        Every non-key column is cast to ``int8``, so numeric columns that
        are not dummy-encoded must fit in the range -128..127.
    """
    # Imports are kept local so the function stays self-contained.
    import numpy as np
    import pandas as pd

    # The following columns aren't used in modelling so drop them.
    # ``drop`` returns a new frame, so the caller's data is left untouched.
    df = df.drop(drops, axis=1)

    # Ensure the categorical columns really are categories before getting
    # dummies. Converting column by column works for one label or a list.
    cat_cols = [cats] if isinstance(cats, str) else list(cats)
    for col in cat_cols:
        df[col] = df[col].astype(int).astype('category')

    if pkey is None:
        pkey = df.columns[0]

    # Keep the key out of the encoding step so it is neither dummified nor
    # squeezed into int8.
    pkey_values = df[pkey]
    print('Total number of columns before dummifying is %d.' % len(df.columns))

    df = pd.get_dummies(df.drop(pkey, axis=1))
    df = df.astype(np.int8)
    df[pkey] = pkey_values

    if binary == 1:
        # Choose to calculate IF predictor has occurred for the key.
        df = df.groupby([pkey]).max().reset_index()
    elif binary == 0:
        # Choose to calculate HOW MANY predictor occurrences for the key.
        df = df.groupby([pkey]).sum().reset_index()
    # binary == 2 (or anything else): output the unaggregated dataset.

    print('Total number of columns after dummifying is %d.' % len(df.columns))
    return df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment