Last active
September 15, 2018 04:59
-
-
Save srishtis/d2052c4c4c0f918604c351faedbf4217 to your computer and use it in GitHub Desktop.
Creation of features for classifications
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Goal and duration percentile buckets, computed separately inside each category.
# Quantile cut points shared by both bucketings: 0-35%, 35-70% and 70-100%.
bucket_edges = [0, .35, .70, 1.0]

# Goal-amount bucket within the project's category, labelled 1, 2 or 3.
# NOTE(review): no duplicates= argument is passed here, so pd.qcut uses its
# default and raises if a category's goal quantile edges coincide -- confirm
# that cannot happen on the real data.
goal_by_category = kick_projects.groupby(['category'])['goal']
kick_projects['goal_cat_perc'] = goal_by_category.transform(
    lambda goals: pd.qcut(goals, bucket_edges, labels=[1, 2, 3]))

# Duration bucket within the project's category. labels=False returns integer
# bin positions instead of labels, and duplicates='drop' collapses coinciding
# edges, so a category may end up with fewer than three buckets.
duration_by_category = kick_projects.groupby(['category'])['duration']
kick_projects['duration_cat_perc'] = duration_by_category.transform(
    lambda durations: pd.qcut(durations, bucket_edges, labels=False, duplicates='drop'))
####################################################################################################
# PARTICIPANTS features for three time periods: quarter, month and week
####################################################################################################
# Number of competitors for a project in a given quarter: rows sharing the same
# category, launch year, launch quarter and goal bucket.
# `name` is selected before counting so only that column is aggregated; the
# resulting `name` column holds the count of non-null names per group.
ks_particpants_qtr = (
    kick_projects
    .groupby(['category', 'launched_year', 'launched_quarter', 'goal_cat_perc'])[['name']]
    .count()
    # the group keys come back as the index; turn them into ordinary columns
    .reset_index()
)
# Number of competitors for a project in a given month: rows sharing the same
# category, launch year, launch month and goal bucket.
# `name` is selected before counting so only that column is aggregated; the
# resulting `name` column holds the count of non-null names per group.
ks_particpants_mth = (
    kick_projects
    .groupby(['category', 'launched_year', 'launched_month', 'goal_cat_perc'])[['name']]
    .count()
    # the group keys come back as the index; turn them into ordinary columns
    .reset_index()
)
# Number of competitors for a project in a given week: rows sharing the same
# category, launch year, launch week and goal bucket.
# `name` is selected before counting so only that column is aggregated; the
# resulting `name` column holds the count of non-null names per group.
ks_particpants_wk = (
    kick_projects
    .groupby(['category', 'launched_year', 'launched_week', 'goal_cat_perc'])[['name']]
    .count()
    # the group keys come back as the index; turn them into ordinary columns
    .reset_index()
)
# Additional features derived from the goal, pledged and backers columns.

# Ratio of pledged amount to goal (a plain ratio, not multiplied by 100).
# It is averaged later to build the success-rate metrics.
pledged_to_goal = kick_projects['pledged'].div(kick_projects['goal'])
kick_projects.loc[:, 'goal_reached'] = pledged_to_goal

# Rows with zero backers are overwritten with one backer so the division
# below is never by zero. Note that this modifies the backers column itself.
no_backers = kick_projects['backers'] == 0
kick_projects.loc[no_backers, 'backers'] = 1

# Pledged amount per backer.
pledged_per_head = kick_projects['pledged'].div(kick_projects['backers'])
kick_projects.loc[:, 'pledge_per_backer'] = pledged_per_head
#####################################################################
# AVG_PPB and AVG_SUCCESS_RATE features
#####################################################################
# Two metrics per (category, launch year, goal bucket):
#   - the mean of pledge_per_backer, and
#   - the mean of goal_reached (the pledged-to-goal ratio), used as a success rate.
# The two value columns are selected with a list: selecting several columns
# from a groupby with a bare tuple of keys is no longer supported by pandas.
ks_ppb_goal = (
    kick_projects
    .groupby(['category', 'launched_year', 'goal_cat_perc'])[['pledge_per_backer', 'goal_reached']]
    .mean()
    # the group keys come back as the index; turn them into ordinary columns
    .reset_index()
)
# Rename the two averaged columns; the three key columns keep their names.
ks_ppb_goal.columns = ['category', 'launched_year', 'goal_cat_perc', 'avg_ppb_goal', 'avg_success_rate_goal']
# Success rate by duration bucket: the mean of goal_reached (pledged-to-goal
# ratio) for each (category, launch year, duration bucket) combination.
mean_ratio_by_duration = (
    kick_projects
    .groupby(['category', 'launched_year', 'duration_cat_perc'])['goal_reached']
    .mean()
)
# Name the averaged values, then move the group keys out of the index so the
# result is a flat table with the three key columns plus the metric.
ks_ppb_duration = mean_ratio_by_duration.rename('avg_success_rate_duration').reset_index()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment