Skip to content

Instantly share code, notes, and snippets.

@srishtis
Last active September 15, 2018 04:59
Show Gist options
  • Save srishtis/d2052c4c4c0f918604c351faedbf4217 to your computer and use it in GitHub Desktop.
Save srishtis/d2052c4c4c0f918604c351faedbf4217 to your computer and use it in GitHub Desktop.
Creation of features for classification
# Goal and duration percentile buckets, computed separately within each category.
# Both columns use the same cut points: the 35th and 70th percentiles split a
# category's values into (up to) three buckets.

# Goal bucket of every project relative to the other projects in its category,
# labelled 1, 2, 3 in increasing order of goal amount.
# NOTE(review): unlike the duration call below, no `duplicates` argument is
# passed here, so pd.qcut keeps its default of raising when two bin edges
# coincide inside a category -- confirm that cannot happen for `goal`.
kick_projects['goal_cat_perc'] = (
    kick_projects
    .groupby(['category'])['goal']
    .transform(lambda goals: pd.qcut(goals, q=[0, .35, .70, 1.0], labels=[1, 2, 3]))
)

# Duration bucket of every project relative to the other projects in its category.
# labels=False returns integer bin positions instead of named labels, and
# duplicates='drop' collapses bins whose edges coincide instead of raising.
kick_projects['duration_cat_perc'] = (
    kick_projects
    .groupby(['category'])['duration']
    .transform(
        lambda durations: pd.qcut(
            durations, q=[0, .35, .70, 1.0], labels=False, duplicates='drop'
        )
    )
)
####################################################################################################
#using above columns to create PARTICIPANTS features for three time periods: quarter, week and month
####################################################################################################
# Each table below counts the projects that share a category, launch year,
# launch period and goal bucket, i.e. the number of competitors a project has
# in that period. The count is the number of non-null `name` values per group
# and stays in a column called `name`.
#
# Only `name` is selected before aggregating, so the remaining columns are not
# counted and then discarded. reset_index() turns the group-by keys (which
# come back as the index) into ordinary columns.

# Competitors in the same category, year, quarter and goal bucket.
ks_particpants_qtr = (
    kick_projects
    .groupby(['category', 'launched_year', 'launched_quarter', 'goal_cat_perc'])[['name']]
    .count()
    .reset_index()
)

# Competitors in the same category, year, month and goal bucket.
ks_particpants_mth = (
    kick_projects
    .groupby(['category', 'launched_year', 'launched_month', 'goal_cat_perc'])[['name']]
    .count()
    .reset_index()
)

# Competitors in the same category, year, week and goal bucket.
ks_particpants_wk = (
    kick_projects
    .groupby(['category', 'launched_year', 'launched_week', 'goal_cat_perc'])[['name']]
    .count()
    .reset_index()
)
# Additional per-project features derived from the goal, pledged and backers columns.

# Pledged amount as a fraction of the goal (a plain ratio, not multiplied by 100).
# This column feeds the averaged success-rate metrics computed further down.
# NOTE(review): `goal` is not guarded against 0 here -- confirm the data
# never contains a zero goal.
kick_projects.loc[:, 'goal_reached'] = kick_projects['pledged'].div(kick_projects['goal'])

# Replace 0 backers with 1 so the division below is always defined.
# This overwrites the `backers` column itself, not a temporary copy.
kick_projects.loc[kick_projects['backers'].eq(0), 'backers'] = 1

# Pledged amount per backer.
kick_projects.loc[:, 'pledge_per_backer'] = kick_projects['pledged'].div(kick_projects['backers'])
#####################################################################
# using these metrics to create AVG_PPB and AVG_SUCCESS_RATE features
#####################################################################
# Two metrics per (category, launch year, goal bucket):
#   * avg_ppb_goal          - mean of pledge_per_backer
#   * avg_success_rate_goal - mean of goal_reached (the pledged-to-goal ratio)
# Both inputs are the per-project columns computed earlier in this script.
#
# The two columns are selected with a list: indexing a GroupBy with a bare
# tuple ('a', 'b') is deprecated and raises in pandas 2.0+.
# reset_index() turns the group-by keys back into ordinary columns.
ks_ppb_goal = (
    kick_projects
    .groupby(['category', 'launched_year', 'goal_cat_perc'])[['pledge_per_backer', 'goal_reached']]
    .mean()
    .reset_index()
)
# Rename the columns: the three keys keep their names, the two means get metric names.
ks_ppb_goal.columns = [
    'category',
    'launched_year',
    'goal_cat_perc',
    'avg_ppb_goal',
    'avg_success_rate_goal',
]
# Success rate by duration bucket: the mean pledged-to-goal ratio (goal_reached)
# for every (category, launch year, duration bucket) combination.
# reset_index() turns the grouped Series into a DataFrame whose group-by keys
# are ordinary columns.
ks_ppb_duration = (
    kick_projects
    .groupby(['category', 'launched_year', 'duration_cat_perc'])['goal_reached']
    .mean()
    .reset_index()
)
# Rename the columns: the three keys keep their names, the mean gets its metric name.
ks_ppb_duration.columns = [
    'category',
    'launched_year',
    'duration_cat_perc',
    'avg_success_rate_duration',
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment