srishtis/kickstarter_feature_engineering.py

## kickstarter_feature_engineering.py

# Goal and duration percentile buckets.
# Every project is bucketed by where its goal amount and its duration fall
# among the projects of the same category. The cut points are the 35th and
# 70th percentiles of that category.
_BUCKET_QUANTILES = [0, .35, .70, 1.0]


def _goal_bucket(goals):
    """Label one category's goal amounts 1, 2 or 3 by percentile bucket."""
    # NOTE(review): no duplicates= argument is passed, so pd.qcut raises
    # ValueError when two quantile edges coincide within a category.
    return pd.qcut(goals, _BUCKET_QUANTILES, labels=[1, 2, 3])


def _duration_bucket(durations):
    """Return integer bucket codes for one category's durations."""
    # labels=False yields integer codes starting at 0; duplicates='drop'
    # merges coinciding edges, so a category may get fewer than three buckets.
    return pd.qcut(durations, _BUCKET_QUANTILES, labels=False, duplicates='drop')


goals_by_category = kick_projects.groupby(['category'])['goal']
kick_projects['goal_cat_perc'] = goals_by_category.transform(_goal_bucket)

durations_by_category = kick_projects.groupby(['category'])['duration']
kick_projects['duration_cat_perc'] = durations_by_category.transform(_duration_bucket)

####################################################################################################
# PARTICIPANTS features for three time periods: quarter, month and week
####################################################################################################

def _count_participants(projects, period_col):
    """Count projects per (category, launch year, period, goal bucket).

    Returns a DataFrame whose grouping keys are ordinary columns (not an
    index) plus a 'name' column holding the number of non-null project names
    in each group. That count is the number of competitors a project had in
    the same category, launch period and goal bucket.
    """
    group_cols = ['category', 'launched_year', period_col, 'goal_cat_perc']
    # Select 'name' before aggregating so only that column is counted, instead
    # of counting every column of the frame and discarding all but one.
    return projects.groupby(group_cols)[['name']].count().reset_index()


# Competitors launched in the same category, year, quarter and goal bucket.
ks_particpants_qtr = _count_participants(kick_projects, 'launched_quarter')

# Competitors launched in the same category, year, month and goal bucket.
ks_particpants_mth = _count_participants(kick_projects, 'launched_month')

# Competitors launched in the same category, year, week and goal bucket.
ks_particpants_wk = _count_participants(kick_projects, 'launched_week')


# Additional per-project features derived from the goal, pledged and backers columns.

# Ratio of the pledged amount to the goal (1.0 means the goal was met exactly).
# It feeds the average success-rate metrics computed further down.
kick_projects.loc[:, 'goal_reached'] = kick_projects['pledged'].div(kick_projects['goal'])

# A project with zero backers would make the per-backer division undefined,
# so the backers column itself is overwritten with 1 for those rows.
has_no_backers = kick_projects['backers'].eq(0)
kick_projects.loc[has_no_backers, 'backers'] = 1

# Pledged amount per backer.
kick_projects.loc[:, 'pledge_per_backer'] = kick_projects['pledged'].div(kick_projects['backers'])

#####################################################################
# using these metrics to create AVG_PPB and AVG_SUCCESS_RATE features
#####################################################################

# Two metrics per (category, launch year, goal bucket):
#   avg_ppb_goal          - mean pledge per backer
#   avg_success_rate_goal - mean pledged-to-goal ratio
# The value columns are selected with a list: indexing a GroupBy with a bare
# tuple of column names ('a', 'b') was deprecated and fails on pandas >= 2.0.
ks_ppb_goal = (
    kick_projects
    .groupby(['category', 'launched_year', 'goal_cat_perc'])[['pledge_per_backer', 'goal_reached']]
    .mean()
    # the group-by keys come back as the index; turn them into columns
    .reset_index()
    # rename by key rather than by position so column order cannot mislabel them
    .rename(columns={'pledge_per_backer': 'avg_ppb_goal',
                     'goal_reached': 'avg_success_rate_goal'})
)

# One metric per (category, launch year, duration bucket):
#   avg_success_rate_duration - mean pledged-to-goal ratio
ks_ppb_duration = (
    kick_projects
    .groupby(['category', 'launched_year', 'duration_cat_perc'])['goal_reached']
    .mean()
    # turns the keys into columns and names the value column in one step
    .reset_index(name='avg_success_rate_duration')
)

	# Goal and duration percentile buckets.
	# Every project is bucketed by where its goal amount and its duration fall
	# among the projects of the same category. The cut points are the 35th and
	# 70th percentiles of that category.
	_BUCKET_QUANTILES = [0, .35, .70, 1.0]


	def _goal_bucket(goals):
		"""Label one category's goal amounts 1, 2 or 3 by percentile bucket."""
		# NOTE(review): no duplicates= argument is passed, so pd.qcut raises
		# ValueError when two quantile edges coincide within a category.
		return pd.qcut(goals, _BUCKET_QUANTILES, labels=[1, 2, 3])


	def _duration_bucket(durations):
		"""Return integer bucket codes for one category's durations."""
		# labels=False yields integer codes starting at 0; duplicates='drop'
		# merges coinciding edges, so a category may get fewer than three buckets.
		return pd.qcut(durations, _BUCKET_QUANTILES, labels=False, duplicates='drop')


	goals_by_category = kick_projects.groupby(['category'])['goal']
	kick_projects['goal_cat_perc'] = goals_by_category.transform(_goal_bucket)

	durations_by_category = kick_projects.groupby(['category'])['duration']
	kick_projects['duration_cat_perc'] = durations_by_category.transform(_duration_bucket)

	####################################################################################################
	# PARTICIPANTS features for three time periods: quarter, month and week
	####################################################################################################

	def _count_participants(projects, period_col):
		"""Count projects per (category, launch year, period, goal bucket).

		Returns a DataFrame whose grouping keys are ordinary columns (not an
		index) plus a 'name' column holding the number of non-null project names
		in each group. That count is the number of competitors a project had in
		the same category, launch period and goal bucket.
		"""
		group_cols = ['category', 'launched_year', period_col, 'goal_cat_perc']
		# Select 'name' before aggregating so only that column is counted, instead
		# of counting every column of the frame and discarding all but one.
		return projects.groupby(group_cols)[['name']].count().reset_index()


	# Competitors launched in the same category, year, quarter and goal bucket.
	ks_particpants_qtr = _count_participants(kick_projects, 'launched_quarter')

	# Competitors launched in the same category, year, month and goal bucket.
	ks_particpants_mth = _count_participants(kick_projects, 'launched_month')

	# Competitors launched in the same category, year, week and goal bucket.
	ks_particpants_wk = _count_participants(kick_projects, 'launched_week')


	# Additional per-project features derived from the goal, pledged and backers columns.

	# Ratio of the pledged amount to the goal (1.0 means the goal was met exactly).
	# It feeds the average success-rate metrics computed further down.
	kick_projects.loc[:, 'goal_reached'] = kick_projects['pledged'].div(kick_projects['goal'])

	# A project with zero backers would make the per-backer division undefined,
	# so the backers column itself is overwritten with 1 for those rows.
	has_no_backers = kick_projects['backers'].eq(0)
	kick_projects.loc[has_no_backers, 'backers'] = 1

	# Pledged amount per backer.
	kick_projects.loc[:, 'pledge_per_backer'] = kick_projects['pledged'].div(kick_projects['backers'])

	#####################################################################
	# using these metrics to create AVG_PPB and AVG_SUCCESS_RATE features
	#####################################################################

	# Two metrics per (category, launch year, goal bucket):
	#   avg_ppb_goal          - mean pledge per backer
	#   avg_success_rate_goal - mean pledged-to-goal ratio
	# The value columns are selected with a list: indexing a GroupBy with a bare
	# tuple of column names ('a', 'b') was deprecated and fails on pandas >= 2.0.
	ks_ppb_goal = (
		kick_projects
		.groupby(['category', 'launched_year', 'goal_cat_perc'])[['pledge_per_backer', 'goal_reached']]
		.mean()
		# the group-by keys come back as the index; turn them into columns
		.reset_index()
		# rename by key rather than by position so column order cannot mislabel them
		.rename(columns={'pledge_per_backer': 'avg_ppb_goal',
						 'goal_reached': 'avg_success_rate_goal'})
	)

	# One metric per (category, launch year, duration bucket):
	#   avg_success_rate_duration - mean pledged-to-goal ratio
	ks_ppb_duration = (
		kick_projects
		.groupby(['category', 'launched_year', 'duration_cat_perc'])['goal_reached']
		.mean()
		# turns the keys into columns and names the value column in one step
		.reset_index(name='avg_success_rate_duration')
	)