Skip to content

Instantly share code, notes, and snippets.

Created May 28, 2016 22:13
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
What would you like to do?
Clean and Enrich event data from Mailchimp
from sqlalchemy import create_engine
from difflib import SequenceMatcher
import pandas as padas
import numpy as np
engine = create_engine('postgresql://USERNAME:PASSWORD@HOSTNAME:5432/DATABASE')
joinedQuery = "select mailchimp_report_email_activity.email_address as email, merge_fields_lname as last_name, merge_fields_fname as first_name, action, timestamp from mailchimp_report_email_activity left join mailchimp_list_members on mailchimp_report_email_activity.email_address = mailchimp_list_members.email_address;"
domains = ['gmail', 'yahoo', 'outlook'] #we might exclude yahoo employees or not?
joinedEventsFrame = padas.read_sql_query(joinedQuery, engine)
#load names list
fNames = padas.read_csv("./femalenames",
delim_whitespace=True, header=None,names=['name','freq','cFreq','rank'],na_values=["NaN"], keep_default_na=False)
mNames = padas.read_csv("./malenames",
delim_whitespace=True, header=None,names=['name','freq','cFreq','rank'])
#convert names to lower case, NaNs are ok with strs
fNames['name'] =
mNames['name'] =
#two cleaning problems, empty strings and None values
joinedEventsFrame['first_name'] = joinedEventsFrame['first_name'].replace(to_replace='', value = np.nan)
joinedEventsFrame = joinedEventsFrame.dropna()
def occuredAt(time):
result = np.nan
if time >=9 and time <= 17:
result = 'during'
elif time >= 0 and time < 9:
result = 'before'
elif time > 17 and time <24:
result = 'after'
return result
def isBusiness(st):
domain = st.split("@")[1]
res = True
for s in domains:
if s in domain:
res = False
return res
def findGender(str):
f = fNames['name'].apply(lambda x: (x, SequenceMatcher(None,str.lower(),x).ratio()))
m = mNames['name'].apply(lambda x: (x, SequenceMatcher(None,str.lower(),x).ratio()))
f = padas.DataFrame([i for i in f], columns=['name','match']).sort_values('match', ascending=False)
m = padas.DataFrame([i for i in m], columns=['name','match']).sort_values('match', ascending=False)
maxMScore = m.iloc[0]['match']
maleName = m.iloc[0]['name']
maxFScore = f.iloc[0]['match']
femaleName = f.iloc[0]['name']
gender = np.nan
if maxFScore > maxMScore:
gender = 'female'
elif maxFScore < maxMScore:
gender = 'male'
femScores = fNames.loc[fNames['name'] == femaleName].iloc[0]['freq']
memScores = mNames.loc[mNames['name'] == maleName].iloc[0]['freq']
if femScores > memScores:
gender = 'female'
elif femScores < memScores:
gender = 'male'
return gender
joinedEventsFrame = joinedEventsFrame.merge(joinedEventsFrame['first_name'].apply(findGender).to_frame(name='gender'), left_index=True, right_index=True)
joinedEventsFrame = joinedEventsFrame.merge(joinedEventsFrame['email'].apply(isBusiness).to_frame(name='isBusiness'), left_index=True, right_index=True)
k = padas.concat([joinedEventsFrame,padas.DataFrame(padas.DatetimeIndex(joinedEventsFrame['timestamp']).hour,index = joinedEventsFrame.index, columns=['hours'])],axis=1)
k['hours'] = k['hours'].apply(occuredAt)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment