Skip to content

Instantly share code, notes, and snippets.

@cpard
cpard / mailchimpLogReg.py
Created July 4, 2016 11:45
Logistic Regression on email campaign data coming from Mailchimp
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 5 18:55:20 2016
@author: cpard
"""
from sqlalchemy import create_engine
import pandas as padas
import numpy as np
import statsmodels.api as sm
@cpard
cpard / MAilchimpEnrich.py
Created May 28, 2016 22:13
Clean and Enrich event data from Mailchimp
from sqlalchemy import create_engine
from difflib import SequenceMatcher
import pandas as padas
import numpy as np
# SQLAlchemy engine for the PostgreSQL database holding the Mailchimp tables.
# NOTE(review): USERNAME, PASSWORD, HOSTNAME and DATABASE look like placeholders
# to be replaced with real connection details -- confirm before running.
engine = create_engine('postgresql://USERNAME:PASSWORD@HOSTNAME:5432/DATABASE')
# Email activity events left-joined to the list members on email address, so
# every event row also carries the recipient's first and last name.
joinedQuery = (
    "select mailchimp_report_email_activity.email_address as email, "
    "merge_fields_lname as last_name, merge_fields_fname as first_name, "
    "action, timestamp "
    "from mailchimp_report_email_activity "
    "left join mailchimp_list_members "
    "on mailchimp_report_email_activity.email_address = "
    "mailchimp_list_members.email_address;"
)

# Free-mail providers used to tell personal addresses from business ones.
# Open question: should Yahoo employees be excluded or not?
domains = [
    'gmail',
    'yahoo',
    'outlook',
]
@cpard
cpard / TimeBinning.py
Created May 28, 2016 22:09
Find out in which part of the day the event occurred
# Add an 'hours' column holding the hour-of-day of each event's timestamp.
# concat returns a new frame rather than modifying its inputs, so the result
# has to be assigned back; otherwise the new column is silently discarded.
joinedEventsFrame = padas.concat(
    [
        joinedEventsFrame,
        padas.DataFrame(
            padas.DatetimeIndex(joinedEventsFrame['timestamp']).hour,
            # Reuse the events' index so the new column lines up row by row.
            index=joinedEventsFrame.index,
            columns=['hours'],
        ),
    ],
    axis=1,
)
def occuredAt(time):
    """Classify an hour of the day relative to the 9-17 window.

    Parameters
    ----------
    time : number
        Hour of the day, expected to lie in [0, 24).

    Returns
    -------
    'before' for hours in [0, 9), 'during' for hours in [9, 17],
    'after' for hours in (17, 24), and NaN for any other value.
    """
    if 9 <= time <= 17:
        return 'during'
    if 0 <= time < 9:
        return 'before'
    if 17 < time < 24:
        return 'after'
    # Out-of-range (or NaN) input fails every comparison above.
    return np.nan
@cpard
cpard / IsBusiness.py
Last active May 28, 2016 22:06
Function to find if a recipient is mailing us from a business email or not
# Free-mail providers; an address whose domain contains one of these is
# treated as personal rather than business.
domains = ['gmail', 'yahoo', 'outlook']


def isBusiness(st):
    """Tell whether an email address belongs to a business domain.

    Parameters
    ----------
    st : str
        Email address; the text after the first '@' is taken as the domain.

    Returns
    -------
    bool
        False when the domain contains any entry of ``domains``, True
        otherwise.

    Raises
    ------
    IndexError
        If ``st`` contains no '@'.
    """
    # Domains are case-insensitive, so compare in lower case; otherwise
    # 'user@GMAIL.com' would be misclassified as a business address.
    domain = st.split("@")[1].lower()
    return not any(provider in domain for provider in domains)
@cpard
cpard / JoinedEventsWithGender.py
Created May 28, 2016 22:03
Events enriched with the gender of the recipient
# Run findGender over every first name and attach the outcome as a 'gender'
# column, matching rows by index on both sides of the merge.
joinedEventsFrame = joinedEventsFrame.merge(
    joinedEventsFrame['first_name'].apply(findGender).to_frame(name='gender'),
    left_index=True,
    right_index=True,
)
@cpard
cpard / findGender.py
Created May 28, 2016 22:01
The implementation of the findGender function
# Score a first name against the female (fNames) and male (mNames) name lists
# using difflib.SequenceMatcher similarity ratios.
# NOTE(review): this excerpt ends right after maxFScore is read -- no
# comparison of the scores and no return statement are visible here, and the
# body has lost its indentation. Confirm against the complete file.
# NOTE(review): the parameter name `str` shadows the builtin inside the body.
def findGender(str):
# Pair every candidate name with its similarity ratio to the lower-cased input.
f = fNames['name'].apply(lambda x: (x, SequenceMatcher(None,str.lower(),x).ratio()))
m = mNames['name'].apply(lambda x: (x, SequenceMatcher(None,str.lower(),x).ratio()))
# Rebuild the pairs as (name, match) frames ordered best match first.
f = padas.DataFrame([i for i in f], columns=['name','match']).sort_values('match', ascending=False)
m = padas.DataFrame([i for i in m], columns=['name','match']).sort_values('match', ascending=False)
# Row 0 after the descending sort is the best-scoring candidate.
maxMScore = m.iloc[0]['match']
maleName = m.iloc[0]['name']
maxFScore = f.iloc[0]['match']
@cpard
cpard / CleaningPart.py
Created May 28, 2016 21:50
Basic Cleaning of our data
# Lower-case the reference names in both census frames.
fNames['name'] = fNames['name'].str.lower()
mNames['name'] = mNames['name'].str.lower()

# Turn empty first names into NaN, then drop every row that has a missing
# value in any column.
joinedEventsFrame['first_name'] = joinedEventsFrame['first_name'].replace('', np.nan)
joinedEventsFrame = joinedEventsFrame.dropna()
@cpard
cpard / Census.py
Last active May 28, 2016 21:47
Read the name data from US Census into Pandas DataFrames
# Load the whitespace-delimited, header-less census name files.
# sep=r'\s+' is the documented equivalent of delim_whitespace=True, which is
# deprecated in recent pandas releases.
fNames = padas.read_csv(
    "./femalenames",
    sep=r'\s+',
    header=None,
    names=['name', 'freq', 'cFreq', 'rank'],
    # Only the literal "NaN" counts as missing here; pandas' default NA
    # markers are switched off. Presumably this keeps real names that look
    # like NA tokens as strings -- confirm against the data file.
    na_values=["NaN"],
    keep_default_na=False,
)
# NOTE(review): the male file is read with pandas' default NA handling,
# unlike the female file above -- confirm this difference is intentional.
mNames = padas.read_csv(
    "./malenames",
    sep=r'\s+',
    header=None,
    names=['name', 'freq', 'cFreq', 'rank'],
)
@cpard
cpard / MailchimpImportdata.py
Created May 28, 2016 21:46
Import Mailchimp Data with Python & Pandas
from sqlalchemy import create_engine
import pandas as padas
# SQLAlchemy engine for the PostgreSQL database holding the Mailchimp tables.
# The URL literal must be closed with a quote; without it the file does not parse.
# NOTE(review): USERNAME, PASSWORD, HOST, PORT and DATABASE look like
# placeholders to be replaced with real connection details.
engine = create_engine('postgresql://USERNAME:PASSWORD@HOST:PORT/DATABASE')

# Email activity events left-joined to the list members on email address, so
# every event row also carries the recipient's first and last name.
joinedQuery = "select mailchimp_report_email_activity.email_address as email, merge_fields_lname as last_name, merge_fields_fname as first_name, action, timestamp from mailchimp_report_email_activity left join mailchimp_list_members on mailchimp_report_email_activity.email_address = mailchimp_list_members.email_address;"

# Run the query and load the result set into a DataFrame.
joinedEventsFrame = padas.read_sql_query(joinedQuery, engine)
@cpard
cpard / gist:715a40ba36d7858bdd11
Created March 22, 2016 13:26
Kafka Connect Source Task Interface
// Method signatures only; no bodies are shown in this excerpt.
// NOTE(review): the descriptions below are inferred from the method names and
// the gist title -- confirm against the Kafka Connect API documentation.
// Presumably reports the version of the task/connector.
public String version()
// Presumably initialises the task from the supplied string configuration map.
public void start(Map<String, String> map)
// Returns a list of SourceRecord objects; declared to throw InterruptedException.
public List<SourceRecord> poll() throws InterruptedException
// Presumably signals the task to shut down.
public void stop()