Clean and Enrich event data from Mailchimp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sqlalchemy import create_engine | |
from difflib import SequenceMatcher | |
import pandas as padas | |
import numpy as np | |
engine = create_engine('postgresql://USERNAME:PASSWORD@HOSTNAME:5432/DATABASE') | |
joinedQuery = "select mailchimp_report_email_activity.email_address as email, merge_fields_lname as last_name, merge_fields_fname as first_name, action, timestamp from mailchimp_report_email_activity left join mailchimp_list_members on mailchimp_report_email_activity.email_address = mailchimp_list_members.email_address;" | |
domains = ['gmail', 'yahoo', 'outlook'] #we might exclude yahoo employees or not? | |
joinedEventsFrame = padas.read_sql_query(joinedQuery, engine) | |
#load names list | |
fNames = padas.read_csv("./femalenames", | |
delim_whitespace=True, header=None,names=['name','freq','cFreq','rank'],na_values=["NaN"], keep_default_na=False) | |
mNames = padas.read_csv("./malenames", | |
delim_whitespace=True, header=None,names=['name','freq','cFreq','rank']) | |
#convert names to lower case, NaNs are ok with strs | |
fNames['name'] = fNames.name.str.lower() | |
mNames['name'] = mNames.name.str.lower() | |
#two cleaning problems, empty strings and None values | |
joinedEventsFrame['first_name'] = joinedEventsFrame['first_name'].replace(to_replace='', value = np.nan) | |
joinedEventsFrame = joinedEventsFrame.dropna() | |
def occuredAt(time): | |
result = np.nan | |
if time >=9 and time <= 17: | |
result = 'during' | |
elif time >= 0 and time < 9: | |
result = 'before' | |
elif time > 17 and time <24: | |
result = 'after' | |
return result | |
def isBusiness(st): | |
domain = st.split("@")[1] | |
res = True | |
for s in domains: | |
if s in domain: | |
res = False | |
break | |
return res | |
def findGender(str): | |
f = fNames['name'].apply(lambda x: (x, SequenceMatcher(None,str.lower(),x).ratio())) | |
m = mNames['name'].apply(lambda x: (x, SequenceMatcher(None,str.lower(),x).ratio())) | |
f = padas.DataFrame([i for i in f], columns=['name','match']).sort_values('match', ascending=False) | |
m = padas.DataFrame([i for i in m], columns=['name','match']).sort_values('match', ascending=False) | |
maxMScore = m.iloc[0]['match'] | |
maleName = m.iloc[0]['name'] | |
maxFScore = f.iloc[0]['match'] | |
femaleName = f.iloc[0]['name'] | |
gender = np.nan | |
if maxFScore > maxMScore: | |
gender = 'female' | |
elif maxFScore < maxMScore: | |
gender = 'male' | |
else: | |
femScores = fNames.loc[fNames['name'] == femaleName].iloc[0]['freq'] | |
memScores = mNames.loc[mNames['name'] == maleName].iloc[0]['freq'] | |
if femScores > memScores: | |
gender = 'female' | |
elif femScores < memScores: | |
gender = 'male' | |
return gender | |
joinedEventsFrame = joinedEventsFrame.merge(joinedEventsFrame['first_name'].apply(findGender).to_frame(name='gender'), left_index=True, right_index=True) | |
joinedEventsFrame = joinedEventsFrame.merge(joinedEventsFrame['email'].apply(isBusiness).to_frame(name='isBusiness'), left_index=True, right_index=True) | |
k = padas.concat([joinedEventsFrame,padas.DataFrame(padas.DatetimeIndex(joinedEventsFrame['timestamp']).hour,index = joinedEventsFrame.index, columns=['hours'])],axis=1) | |
k['hours'] = k['hours'].apply(occuredAt) | |
print(k) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment