Skip to content

Instantly share code, notes, and snippets.

@alejio
Last active July 13, 2016 19:21
Show Gist options
  • Save alejio/f3d39ed967cf5aa554e1d9dae8d2ffd1 to your computer and use it in GitHub Desktop.
Save alejio/f3d39ed967cf5aa554e1d9dae8d2ffd1 to your computer and use it in GitHub Desktop.
Python, Pandas: Helpful function for creating dummy vars on a df using pd.get_dummies()
def df_get_dummies(df, drops, cats, pkey=None, binary=1):
    """One-hot encode a DataFrame and optionally aggregate it per key.

    Steps, in order:

    1. Drop the columns listed in ``drops``.
    2. Cast every column in ``cats`` to integer and then to the pandas
       ``category`` dtype so ``pd.get_dummies`` expands it.
    3. Set the key column aside, dummify everything else and cast the
       result to ``int8``.
    4. Re-attach the key column and aggregate rows per key according to
       ``binary``.

    The number of columns before and after dummifying is printed.

    Args:
        df: Input ``pandas.DataFrame``. It is not modified; a new frame is
            returned.
        drops: Column label or list of labels to remove before encoding.
        cats: Column label or list of labels holding integer-like codes
            that must be treated as categorical.
        pkey: Label of the key column used for grouping. Defaults to the
            first column remaining after ``drops`` are removed.
        binary: Aggregation mode.
            ``1`` -- per-key maximum (did the predictor ever occur?);
            ``0`` -- per-key sum (how many times did it occur?);
            ``2`` (or any other value) -- no aggregation.

    Returns:
        The dummified ``pandas.DataFrame``. For ``binary`` 0 or 1 there is
        one row per key with the key as the first column; otherwise the
        rows are unaggregated and the key is the last column.

    Note:
        Every non-key column is cast to ``int8`` after dummifying, so any
        numeric column that is not expanded into dummies must hold values
        that fit in that range.
    """
    # Imports are kept local so the function stays self-contained.
    import numpy as np
    import pandas as pd

    # The following columns aren't used in modelling, so drop them.
    # ``drop`` returns a new frame, so the caller's ``df`` is left untouched.
    df = df.drop(drops, axis=1)

    # Accept a single label as well as a list of labels.
    cat_cols = [cats] if isinstance(cats, str) else list(cats)
    # Ensure the categorical code columns really are categories before
    # getting dummies; going through int normalises codes such as 1.0 -> 1.
    for col in cat_cols:
        df[col] = df[col].astype(int).astype('category')

    if pkey is None:
        pkey = df.columns[0]

    # Keep the key aside so it is neither dummified nor cast to int8.
    pkey_values = df[pkey]
    print('Total number of columns before dummifying is %d.' % len(df.columns))

    df = df.drop(pkey, axis=1)
    df = pd.get_dummies(df)
    # Vectorised cast; also turns boolean dummies into 0/1 integers.
    df = df.astype(np.int8)
    df[pkey] = pkey_values

    if binary == 1:
        # Calculate IF the predictor has occurred for the key.
        df = df.groupby([pkey]).max().reset_index()
    elif binary == 0:
        # Calculate HOW MANY times the predictor occurred for the key.
        df = df.groupby([pkey]).sum().reset_index()
    # Any other value (documented as 2): output the unaggregated dataset.

    print('Total number of columns after dummifying is %d.' % len(df.columns))
    return df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment