vansika/queries

## queries
top_artists_df = df.select('mb_artist_credit_id', 'user_name') \
                   .groupBy('mb_artist_credit_id', 'user_name') \
                   .agg(func.count('mb_artist_credit_id').alias('count'))

window = Window.partitionBy('user_name').orderBy(col('count').desc())

top_artists_df = top_artists_df.withColumn('rank', row_number().over(window)) \
                   .where(col('rank') <= config.TOP_ARTISTS_LIMIT) \
                   .select('mb_artist_credit_id', 'user_name')

Distinct mb_artist_credit_id = 11307

-----------------------------------------------------------------------------------------------------------
df1 = top_artists_df.join(artists_relation_df, top_artists_df.mb_artist_credit_id == artists_relation_df.id_0, 'inner') \
                    .select(
                              col('id_0').alias('top_artist_credit_id'),
                              col('id_1').alias('similar_artist_credit_id'),
                              'score',
                              'user_name'
                     )

df2 = top_artists_df.join(artists_relation_df, top_artists_df.mb_artist_credit_id == artists_relation_df.id_1, 'inner')
                    .select(
                              col('id_1').alias('top_artist_credit_id'),
                              col('id_0').alias('similar_artist_credit_id'),
                              'score',
                              'user_name'
                    )

similar_artists_df = df1.union(df2)


Distinct top_artist_credit_id = 7348

---------------------------------------------------------------------------------------------------------------

When I do this : top_artists_df.subtract(similar_artists_df), I get 4319.

df1.subtract(df2) basically displays rows in df1 which are not in df2.

11307 - 7348 = 3959 but we get 4319 that means there are some artist credit ids in similar_artists_df which are not in
top_artists_df.

When we look at similar_artists_df query we see that there is a join between the artist _relation and top_artists_df.

Any credit_id in similar_artists_df should be in top_artists_df because there is join but the vice verse is not true since
artists_artist_relation does not contain credit ids for all the artists which ever exist.

These extra credit ids in similar_artists_df is my concern
	top_artists_df = df.select('mb_artist_credit_id', 'user_name') \
	.groupBy('mb_artist_credit_id', 'user_name') \
	.agg(func.count('mb_artist_credit_id').alias('count'))

	window = Window.partitionBy('user_name').orderBy(col('count').desc())

	top_artists_df = top_artists_df.withColumn('rank', row_number().over(window)) \
	.where(col('rank') <= config.TOP_ARTISTS_LIMIT) \
	.select('mb_artist_credit_id', 'user_name')

	Distinct mb_artist_credit_id = 11307

	-----------------------------------------------------------------------------------------------------------
	df1 = top_artists_df.join(artists_relation_df, top_artists_df.mb_artist_credit_id == artists_relation_df.id_0, 'inner') \
	.select(
	col('id_0').alias('top_artist_credit_id'),
	col('id_1').alias('similar_artist_credit_id'),
	'score',
	'user_name'
	)

	df2 = top_artists_df.join(artists_relation_df, top_artists_df.mb_artist_credit_id == artists_relation_df.id_1, 'inner')
	.select(
	col('id_1').alias('top_artist_credit_id'),
	col('id_0').alias('similar_artist_credit_id'),
	'score',
	'user_name'
	)

	similar_artists_df = df1.union(df2)


	Distinct top_artist_credit_id = 7348

	---------------------------------------------------------------------------------------------------------------

	When I do this : top_artists_df.subtract(similar_artists_df), I get 4319.

	df1.subtract(df2) basically displays rows in df1 which are not in df2.

	11307 - 7348 = 3959 but we get 4319 that means there are some artist credit ids in similar_artists_df which are not in
	top_artists_df.

	When we look at similar_artists_df query we see that there is a join between the artist _relation and top_artists_df.

	Any credit_id in similar_artists_df should be in top_artists_df because there is join but the vice verse is not true since
	artists_artist_relation does not contain credit ids for all the artists which ever exist.

	These extra credit ids in similar_artists_df is my concern