from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# evaluate the model with the test set
evaluator = MulticlassClassificationEvaluator()
print('F1-Score ', evaluator.evaluate(prediction, {evaluator.metricName: 'f1'}))
F1-Score 0.6736596736596737
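For context, a minimal sketch of how a scored `prediction` DataFrame like the one above could be produced; the `data` DataFrame, the 80/20 split, and the logistic regression baseline are assumptions, not taken from the gist:

from pyspark.ml.classification import LogisticRegression

# hypothetical split of the assembled user-level features into train and test sets
train, test = data.randomSplit([0.8, 0.2], seed=42)

# fit a baseline model and score the held-out set
lr = LogisticRegression(featuresCol='features', labelCol='label')
prediction = lr.fit(train).transform(test)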
f1_gbt = MulticlassClassificationEvaluator(labelCol="indexedLabel", metricName='f1').evaluate(predictions_gbt)
print('F1', f1_gbt)
F1 0.7115836101882613
rf_f1 = MulticlassClassificationEvaluator(labelCol="indexedLabel",metricName='f1').evaluate(predictions)
print('F1 Score', rf_f1)
F1 Score 0.6919632934386234
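The two F1 scores above compare a gradient-boosted tree model against a random forest. A sketch of how `predictions_gbt` and `predictions` could be generated, assuming the same `train`/`test` split, a `features` vector column, and an `indexedLabel` column produced by a StringIndexer; the hyperparameters are illustrative:

from pyspark.ml.classification import GBTClassifier, RandomForestClassifier

# gradient-boosted trees scored on the held-out set
gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="features", maxIter=20)
predictions_gbt = gbt.fit(train).transform(test)

# random forest scored on the same held-out set
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=50)
predictions = rf.fit(train).transform(test)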
+------+------------------+------------------+--------------+--------------+------------------+
|gender|subscription_level|auth_logged_in_cnt|auth_guest_cnt|status_404_cnt|page_next_song_cnt|
+------+------------------+------------------+--------------+--------------+------------------+
| F| free| 11| 4| 6| 9|
+------+------------------+------------------+--------------+--------------+------------------+
from pyspark.sql import Window
from pyspark.sql.functions import col, round, sum

# share of users in each churn class
user_count = user_df.groupby('churn').count()
user_count = user_count.withColumn('percent', col('count') / sum('count').over(Window.partitionBy()))
# multiply by 100 and round to two decimal places
user_count = user_count.withColumn("percent", round(user_count["percent"] * 100, 2))
user_count.orderBy('percent', ascending=False).show()
+-----+-----+-------+
|churn|count|percent|
+-----+-----+-------+
| 0| 173| 76.89|
from pyspark.sql.functions import col, when, max as sparkMax

def aggregate_to_user_level(df):
    """
    Aggregate the selected features to the user level
    """
    exprs = [
        sparkMax(col('churn')).alias('churn'),
        sparkMax(col('Gender')).alias('gender'),
        sparkMax(col('level')).alias('subscription_level'),
        sparkMax(col('device_type')).alias('device_type'),
        sparkMax(when(col("page") == 'Upgrade', 1).otherwise(0)).alias('page_upgraded'),
    ]
    # collapse the event log to one row per user
    return df.groupby('userId').agg(*exprs)
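Assuming the churn flag and the raw columns referenced above are already on the event-level DataFrame `df`, the aggregation can be applied like this (the `user_df` name matches the counting snippet earlier):

# one row per user, ready for feature assembly and modelling
user_df = aggregate_to_user_level(df)
user_df.show(1)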
from pyspark.sql.functions import col, when, lit

def get_churned_users(df):
    """
    Find the users that cancelled so we can identify who churned.
    Return the updated dataframe with an additional column identifying them as such.
    """
    cancelled_ids = df.filter('page == "Cancellation Confirmation"').select("userId").distinct()
    # Convert to a list to be used in the filter below
    cancelled_ids = cancelled_ids.toPandas()['userId'].tolist()
    # 1 when a user churned and 0 when they did not
    df = df.withColumn("Churn", when(col("userId").isin(cancelled_ids), lit('1')).otherwise(lit('0')))
    return df
def topn_values(column_name, n):
    """
    Take a column name and return the n most frequent values (uses the global df)
    """
    return df.groupby(column_name).count().sort('count', ascending=False).limit(n).toPandas()
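For example, listing the most frequent page types; the column name comes from the event schema shown below, and the call itself is illustrative:

# ten most common page values in the event log
topn_values('page', 10)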
+--------------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+---------------+-------------+---------+--------------------+------+-------------+--------------------+------+
| artist| auth|firstName|gender|itemInSession|lastName| length|level| location|method| page| registration|sessionId| song|status| ts| userAgent|userId|
+--------------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+---------------+-------------+---------+--------------------+------+-------------+--------------------+------+
| Martha Tilston|Logged In| Colin| M| 50| Freeman|277.89016| paid| Bakersfield, CA| PUT| NextSong|1538173362000| 29| Rockpools| 200|1538352117000|Mozilla/5.0 (Wind...| 30|
| Five Iron Frenzy|Logged In| Micah| M| 79| Long|236.09424| free|Boston-Cambridge-...| PUT|
root
|-- artist: string (nullable = true)
|-- auth: string (nullable = true)
|-- firstName: string (nullable = true)
|-- gender: string (nullable = true)
|-- itemInSession: long (nullable = true)
|-- lastName: string (nullable = true)
|-- length: double (nullable = true)
|-- level: string (nullable = true)
|-- location: string (nullable = true)
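The schema above comes from the raw event log. For reference, a minimal loading sketch; the session name and file path are placeholders, not taken from the gist:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("churn-analysis").getOrCreate()

# read the raw user-event log (path is hypothetical) and print its schema
df = spark.read.json("path/to/event_data.json")
df.printSchema()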