from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# evaluate the model with the test set
evaluator = MulticlassClassificationEvaluator()
print('F1-Score ', evaluator.evaluate(prediction, {evaluator.metricName: 'f1'}))
F1-Score 0.6736596736596737
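For context, a minimal sketch of how a scored `prediction` DataFrame like the one above could be produced; the `data` DataFrame, the 80/20 split, and the logistic regression baseline are assumptions, not taken from the gist:

from pyspark.ml.classification import LogisticRegression

# hypothetical split of the assembled user-level features into train and test sets
train, test = data.randomSplit([0.8, 0.2], seed=42)

# fit a baseline model and score the held-out set
lr = LogisticRegression(featuresCol='features', labelCol='label')
prediction = lr.fit(train).transform(test)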
f1_gbt = MulticlassClassificationEvaluator(labelCol="indexedLabel", metricName='f1').evaluate(predictions_gbt)
print('F1', f1_gbt)
F1 0.7115836101882613
rf_f1 = MulticlassClassificationEvaluator(labelCol="indexedLabel",metricName='f1').evaluate(predictions)
print('F1 Score', rf_f1)
F1 Score 0.6919632934386234
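The two F1 scores above compare a gradient-boosted tree model against a random forest. A sketch of how `predictions_gbt` and `predictions` could be generated, assuming the same `train`/`test` split, a `features` vector column, and an `indexedLabel` column produced by a StringIndexer; the hyperparameters are illustrative:

from pyspark.ml.classification import GBTClassifier, RandomForestClassifier

# gradient-boosted trees scored on the held-out set
gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="features", maxIter=20)
predictions_gbt = gbt.fit(train).transform(test)

# random forest scored on the same held-out set
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=50)
predictions = rf.fit(train).transform(test)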
+------+------------------+------------------+--------------+--------------+------------------+
|gender|subscription_level|auth_logged_in_cnt|auth_guest_cnt|status_404_cnt|page_next_song_cnt|
+------+------------------+------------------+--------------+--------------+------------------+
| F| free| 11| 4| 6| 9|
+------+------------------+------------------+--------------+--------------+------------------+
from pyspark.sql import Window
from pyspark.sql.functions import col, round, sum

# share of users in each churn class
user_count = user_df.groupby('churn').count()
user_count = user_count.withColumn('percent', col('count') / sum('count').over(Window.partitionBy()))
# multiply by 100 and round to two decimal places
user_count = user_count.withColumn("percent", round(user_count["percent"] * 100, 2))
user_count.orderBy('percent', ascending=False).show()
+-----+-----+-------+
|churn|count|percent|
+-----+-----+-------+
| 0| 173| 76.89|
from pyspark.sql.functions import col, when, max as sparkMax

def aggregate_to_user_level(df):
    """
    Aggregate the selected features to the user level
    """
    exprs = [
        sparkMax(col('churn')).alias('churn'),
        sparkMax(col('Gender')).alias('gender'),
        sparkMax(col('level')).alias('subscription_level'),
        sparkMax(col('device_type')).alias('device_type'),
        sparkMax(when(col("page") == 'Upgrade', 1).otherwise(0)).alias('page_upgraded'),
    ]
    # collapse the event log to one row per user
    return df.groupby('userId').agg(*exprs)
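Assuming the churn flag and the raw columns referenced above are already on the event-level DataFrame `df`, the aggregation can be applied like this (the `user_df` name matches the counting snippet earlier):

# one row per user, ready for feature assembly and modelling
user_df = aggregate_to_user_level(df)
user_df.show(1)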
from pyspark.sql.functions import col, when, lit

def get_churned_users(df):
    """
    Find the users that cancelled so we can identify who churned.
    Return the updated dataframe with an additional column identifying them as such.
    """
    cancelled_ids = df.filter('page == "Cancellation Confirmation"').select("userId").distinct()
    # Convert to a list to be used in the filter below
    cancelled_ids = cancelled_ids.toPandas()['userId'].tolist()
    # 1 when a user churned and 0 when they did not
    df = df.withColumn("Churn", when(col("userId").isin(cancelled_ids), lit('1')).otherwise(lit('0')))
    return df
def topn_values(column_name, n):
    """
    Take a column name and return the n most frequent values (uses the global df)
    """
    return df.groupby(column_name).count().sort('count', ascending=False).limit(n).toPandas()
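For example, listing the most frequent page types; the column name comes from the event schema shown below, and the call itself is illustrative:

# ten most common page values in the event log
topn_values('page', 10)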
+--------------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+---------------+-------------+---------+--------------------+------+-------------+--------------------+------+
| artist| auth|firstName|gender|itemInSession|lastName| length|level| location|method| page| registration|sessionId| song|status| ts| userAgent|userId|
+--------------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+---------------+-------------+---------+--------------------+------+-------------+--------------------+------+
| Martha Tilston|Logged In| Colin| M| 50| Freeman|277.89016| paid| Bakersfield, CA| PUT| NextSong|1538173362000| 29| Rockpools| 200|1538352117000|Mozilla/5.0 (Wind...| 30|
| Five Iron Frenzy|Logged In| Micah| M| 79| Long|236.09424| free|Boston-Cambridge-...| PUT|
root
|-- artist: string (nullable = true)
|-- auth: string (nullable = true)
|-- firstName: string (nullable = true)
|-- gender: string (nullable = true)
|-- itemInSession: long (nullable = true)
|-- lastName: string (nullable = true)
|-- length: double (nullable = true)
|-- level: string (nullable = true)
|-- location: string (nullable = true)
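The schema above comes from the raw event log. For reference, a minimal loading sketch; the session name and file path are placeholders, not taken from the gist:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("churn-analysis").getOrCreate()

# read the raw user-event log (path is hypothetical) and print its schema
df = spark.read.json("path/to/event_data.json")
df.printSchema()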