Skip to content

Instantly share code, notes, and snippets.

@bh1995
Created January 13, 2021 22:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bh1995/1f9bdea2912cb9197c4d6813f61d9e4a to your computer and use it in GitHub Desktop.
Save bh1995/1f9bdea2912cb9197c4d6813f61d9e4a to your computer and use it in GitHub Desktop.
# Build a (listing id, rating) lookup from the listings data.
# Both columns are cast to integers; rows where either cast yields null are
# dropped so that only listings with a usable id and rating remain.
listings_score_only_df = listings_df.select(['id', 'review_scores_rating']).cache()
listings_score_only_df = (
    listings_score_only_df
    .withColumn('id', listings_score_only_df['id'].cast(IntegerType()))
    .filter(col('id').isNotNull())
)
listings_score_only_df = (
    listings_score_only_df
    .withColumn(
        'review_scores_rating',
        listings_score_only_df['review_scores_rating'].cast(IntegerType()),
    )
    .filter(col('review_scores_rating').isNotNull())
)

# Keep only the listing id and the review text from the reviews data.
reviews_with_score_df = reviews_df.select(['listing_id', 'comments']).cache()
reviews_with_score_df = reviews_with_score_df.withColumn(
    'listing_id',
    reviews_with_score_df['listing_id'].cast(IntegerType()),
)
# Remove any rows with null values
reviews_with_score_df = (
    reviews_with_score_df
    .filter(col('listing_id').isNotNull())
    .filter(col('comments').isNotNull())
)

# Join every review with the rating of the listing it belongs to; the
# listing's rating is exposed as 'mean_score'.
# NOTE: cache() must be *called* -- a bare `.cache` would bind the method
# object to the variable instead of a DataFrame.
reviews_with_score_df = (
    reviews_with_score_df
    .join(
        listings_score_only_df,
        reviews_with_score_df.listing_id == listings_score_only_df.id,
    )
    .select(
        reviews_with_score_df['*'],
        listings_score_only_df['review_scores_rating'].alias('mean_score'),
    )
    .cache()
)

# Display the first rows of the joined result. show() prints the table
# itself and returns None, so it must not be wrapped in print().
reviews_with_score_df.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment