Skip to content

Instantly share code, notes, and snippets.

@SohierDane
Last active January 26, 2017 00:50
Show Gist options
  • Save SohierDane/66a606556042433527bff59bd72406f9 to your computer and use it in GitHub Desktop.
Save SohierDane/66a606556042433527bff59bd72406f9 to your computer and use it in GitHub Desktop.
"""
Calculates key performance indicators for the 1 click appliers.
Manual validation of 5 cl missing reply link failures (is link actually there?)
https://columbus.craigslist.org/eng/5939246199.html', correct
'https://seattle.craigslist.org/tac/edu/5946859577.html', correct
'https://seattle.craigslist.org/see/egr/5955565817.html', correct
'https://minneapolis.craigslist.org/hnp/npo/5961895846.html', correct
'https://seattle.craigslist.org/see/hea/5964050043.html correct
Manual validation of 5 cl missing reply email failures (is email actually there?)
[u'https://columbus.craigslist.org/eng/5939246199.html', correct
u'https://seattle.craigslist.org/tac/edu/5946859577.html', correct
u'https://seattle.craigslist.org/see/egr/5955565817.html', correct
u'https://columbus.craigslist.org/sof/5935009087.html', wrong/has reply email
u'https://minneapolis.craigslist.org/hnp/npo/5961895846.html' correct]
"""
import pandas as pd
from sqlalchemy import create_engine
from collections import OrderedDict
def user_ids_to_ignore():
return {6134, 151, 28808, 32952, 32953, 8177, 32949, 15, 8176, 320, 179, 1, 41269, 41265,
6152, 375, 6119, 6120, 6084, 278, 4196, 6064, 21718, 14471, 21163, 15267, 9847,
212, 324, 207, 272, 134, 209, 210, 190, 144, 196, 321, 140, 88, 170, 188, 243,
169, 120, 15243, 6167, 217, 132, 133, 8184, 15244, 15245, 15246, 8164, 317, 215,
148, 160, 182, 21717, 6148, 6150, 6135, 6113, 6146, 6166, 216, 202, 203, 197, 219,
161, 271, 198, 6162, 21171, 359, 386, 8163, 194, 159, 239, 6158, 6161, 6163, 195,
255, 8178, 51, 41}
def load_job_scouts():
pg_user_name = 'sohier'
pg_database_name = 'talentworks_development'
engine = create_engine('postgresql://{0}@localhost:5432/{1}'.format(
pg_user_name, pg_database_name))
query = "select * from job_scouts where created_at > '20170101'::date"
df = pd.read_sql_query(query, con=engine)
return df[~df.user_id.isin(user_ids_to_ignore()) & ~df.aasm_state.isin(
['deleted', 'not_valid'])]
def load_postings(job_scout_df):
pg_user_name = 'sohier'
pg_database_name = 'talentworks_development'
engine = create_engine('postgresql://{0}@localhost:5432/{1}'.format(
pg_user_name, pg_database_name))
postings = pd.read_sql_query('select * from postings', con=engine)
postings.rename(columns={x: 'post_'+x for x in postings.columns if x in
job_scout_df.columns}, inplace=True)
return postings
def has_events_in_context(df):
df = df.dropna(subset=['context'])
return df[df.context.apply(lambda x: 'events' in x.keys())]
def began_application(context):
for event in [x for x in context['events'] if 'event' in x.keys()]:
if "Applying..." in event['event']:
return True
return False
def finished_application(context):
for event in [x for x in context['events'] if 'event' in x.keys()]:
if event['event'].startswith('Applied'):
return True
return False
def user_rejected(context):
for event in [x for x in context['events'] if 'event' in x.keys()]:
if event['event'].startswith('Rejected'):
return True
return False
def user_requested_application(context):
for event in [x for x in context['events'] if 'msg' in x.keys()]:
if "You told us that you'd like us to apply to this job on your behalf." in event['msg']:
return True
return False
def times_requested_more_info(context):
return len([event['event'] for event in [x for x in context['events'] if 'msg' in x.keys()]
if event['event'].startswith("Info Needed")])
def times_user_gave_requested_info(context):
return len([event['event'] for event in [x for x in context['events'] if 'event' in x.keys()]
if "Info Provided" in event['event']])
def get_parser(post_context):
if not post_context:
return None
if 'parser' not in post_context:
return None
return post_context['parser']
def has_reply_link(post_context):
if not post_context:
return False
if 'reply_link'not in post_context:
return False
if len(post_context['reply_link']) > 0:
return True
def has_reply_email(post_context):
if not post_context:
return False
if 'reply_email'not in post_context:
return False
if len(post_context['reply_email']) > 0:
return True
def recent_posts_with_no_parser(df):
days_to_pull = 2
return df[df.parser.isnull() & (df.created_at.apply(pd.datetime.date) >
(pd.datetime.today() - pd.DateOffset(days_to_pull)).date())]
def calc_kpis(df):
kpis = OrderedDict()
kpis['num_job_scouts'] = len(load_job_scouts())
kpis['num_applications_started'] = sum(df.began_application)
kpis['posts_with_no_parser'] = len(df[df.parser.isnull()])
kpis['application_success_by_ATS'] = df[
['parser', 'began_application', 'user_rejected_after_began', 'finished_application',
'failed', 'user_gave_all_requested_info', 'missing_info', 'failed_missing_info']].groupby('parser').sum()
kpis['application_success_by_ATS']['job_scouts'] = df[['parser', 'id']].groupby(
'parser').count()
column_totals = kpis['application_success_by_ATS'].sum()
column_totals.name = "Total"
kpis['application_success_by_ATS'] = kpis['application_success_by_ATS'].append(column_totals)
return kpis
def process_job_scouts(df):
df['has_events_in_context'] = df.index.isin(has_events_in_context(df).index)
df = df[df['has_events_in_context']]
df['application_requested'] = df['context'].apply(user_requested_application)
df['began_application'] = df['context'].apply(began_application)
df['user_rejected'] = df['context'].apply(user_rejected)
df['user_rejected_after_began'] = (df.user_rejected & df.began_application)
df['finished_application'] = df['context'].apply(finished_application)
df['failed'] = (df.began_application & ~df.user_rejected & ~df.finished_application)
df['times_requested_more_info'] = df['context'].apply(times_requested_more_info)
df['times_user_gave_requested_info'] = df['context'].apply(times_user_gave_requested_info)
df['missing_info'] = df['times_requested_more_info'] > df['times_user_gave_requested_info']
df['failed_missing_info'] = (df.failed & df.missing_info)
df['user_gave_all_requested_info'] = (df.began_application &
(df.times_user_gave_requested_info > 0) & ~df.failed_missing_info)
df['parser'] = df['post_context'].apply(get_parser)
return df
if __name__ == '__main__':
df = load_job_scouts()
postings = load_postings(df)
df = pd.merge(df, postings, left_on='posting_id', right_on='post_id')
df = process_job_scouts(df)
kpis = calc_kpis(df)
print('\n\n')
for label, kpi in kpis.iteritems():
if label == 'application_success_by_ATS':
print('\n')
print(kpi.to_string())
print('\n')
else:
print label, kpi
cl_df = df[(df.parser == 'CraigslistParser') & df.failed].copy()
cl_df['has_reply_link'] = cl_df.post_context.apply(has_reply_link)
cl_df['has_reply_email'] = cl_df.post_context.apply(has_reply_email)
cl_df['unexplained_failure'] = (cl_df.has_reply_link & cl_df.has_reply_email)
print "Craigslist failures due to missing reply link: %i" % sum(~cl_df['has_reply_link'])
print "Craigslist failures due to missing reply email: %i" % sum(
~cl_df['has_reply_email'] & cl_df['has_reply_link'])
print "Craigslist failures with unknown cause: %i" % sum(cl_df['unexplained_failure'])
# print "Unexplained failures concentrated in the following user IDs:"
# print cl_df[cl_df.unexplained_failure][['unexplained_failure', 'user_id']].groupby('user_id').count().to_string()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment