Skip to content

Instantly share code, notes, and snippets.

@bh1995
Last active January 13, 2021 22:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bh1995/21c84f1be18327c7cd27a153e655f320 to your computer and use it in GitHub Desktop.
Save bh1995/21c84f1be18327c7cd27a153e655f320 to your computer and use it in GitHub Desktop.
import re
# Maximum number of reviews pulled from the DataFrame.
N_total = 32261216
# Word-occurrence lower threshold: a word must appear in more than 0.5 percent
# of N_total reviews to be kept.
# NOTE(review): the threshold is derived from N_total, not from the number of
# reviews that actually survive the filters below -- confirm this is intended.
N_limit = 0.005 * N_total
# Baseline subtracted from every review score before averaging, so each word's
# final value is its mean score relative to this baseline.
score_threshold = 90

# Compiled once instead of being looked up for every review. The apostrophe in
# the character class never matches here because apostrophes are stripped from
# the text before tokenising; the pattern is kept unchanged on purpose.
_WORD_PATTERN = re.compile(r"[\w']+")


def _is_usable_review(row):
    """Return True when the row has a non-null score <= 100 and a non-empty comment.

    The explicit null check matters: on Python 3 ``None <= 100`` raises
    ``TypeError``, which would abort the whole Spark job on the first review
    that has no score.
    """
    comment, score = row[0], row[1]
    if score is None or not score <= 100:
        return False
    return comment is not None and len(comment) > 0


def _word_score_pairs(row):
    """Return ``(word, (1, score - score_threshold))`` for each distinct long word.

    The comment is lower-cased and stripped of apostrophes before tokenising.
    Words are de-duplicated per review, so a word counts at most once per
    review, and only words longer than three characters are kept.
    """
    text = row[0].lower().replace("'", "")
    contribution = (1, row[1] - score_threshold)
    return [
        (word, contribution)
        for word in set(_WORD_PATTERN.findall(text))
        if len(word) > 3
    ]


comments_cleaned = (
    reviews_with_score_df
    .select(['comments', 'mean_score'])
    .limit(N_total)
    .repartition(4)
    .rdd
    .filter(_is_usable_review)
    .flatMap(_word_score_pairs)
    # Per word: (number of reviews containing it, summed score offsets).
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
    # Drop words that occur in too few reviews.
    .filter(lambda kv: kv[1][0] > N_limit)
    # Mean score offset per word.
    .map(lambda kv: (kv[0], kv[1][1] / kv[1][0]))
    # Highest mean offset first.
    .sortBy(lambda kv: kv[1], ascending=False)
)
# List of (word, mean score offset) tuples, best-scoring words first.
result = comments_cleaned.collect()
# print('Top 10 words:', result[:10])
# print('Bottom 10 words:', result[-10:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment