bh1995/word_score

## word_score
import re
# Maximum number of reviews to process (passed to DataFrame.limit below).
N_total = 32261216
# Word occurrence lower threshold: a word must appear in more than 0.5 percent
# of N_total reviews to be kept.
# NOTE(review): the threshold is derived from N_total, not from the number of
# rows that actually survive the filters below -- confirm this is intended.
N_limit = 0.005 * N_total
# Scores are expressed relative to this value: a positive result means the
# reviews containing the word average above it, a negative result below it.
score_threshold = 90

# Build a list of (word, average score offset) pairs, best-scoring words first.
comments_cleaned = (
    reviews_with_score_df
    .select(['comments', 'mean_score'])
    .limit(N_total)
    .repartition(4)
    .rdd
    # Each row is (comments, mean_score). Rows without a score are dropped
    # before the comparison, because `None <= 100` raises TypeError on
    # Python 3 and would fail the whole job.
    .filter(lambda row: row[1] is not None and row[1] <= 100)
    # Skip missing or empty comments.
    .filter(lambda row: row[0] is not None and len(row[0]) > 0)
    # Normalise the text: lower-case and strip apostrophes ("don't" -> "dont").
    .map(lambda row: (row[0].lower().replace('\'', ''), row[1]))
    # Emit one (word, score) pair per distinct word of a review; the set()
    # makes a word repeated inside one review count only once.
    .flatMap(lambda row: [(word, row[1]) for word in set(re.findall(r"[\w']+", row[0]))])
    # Keep only words longer than three characters.
    .filter(lambda pair: len(pair[0]) > 3)
    # (word, (occurrence count, score offset from the threshold)).
    .map(lambda pair: (pair[0], (1, pair[1] - score_threshold)))
    # Sum the counts and the offsets per word.
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
    # Discard words that do not occur often enough.
    .filter(lambda pair: pair[1][0] > N_limit)
    # Average score offset per word.
    .map(lambda pair: (pair[0], pair[1][1] / pair[1][0]))
    # Sort by the second value, i.e. the score, highest first.
    .sortBy(lambda pair: pair[1], ascending=False)
)

# Materialise the sorted (word, score) pairs on the driver.
result = comments_cleaned.collect()
# print('Top 10 words:', result[:10])
# print('Bottom 10 words:', result[-10:])
	import re
	# Maximum number of reviews to process (passed to DataFrame.limit below).
	N_total = 32261216
	# Word occurrence lower threshold: a word must appear in more than 0.5 percent
	# of N_total reviews to be kept.
	# NOTE(review): the threshold is derived from N_total, not from the number of
	# rows that actually survive the filters below -- confirm this is intended.
	N_limit = 0.005 * N_total
	# Scores are expressed relative to this value: a positive result means the
	# reviews containing the word average above it, a negative result below it.
	score_threshold = 90

	# Build a list of (word, average score offset) pairs, best-scoring words first.
	comments_cleaned = (
		reviews_with_score_df
		.select(['comments', 'mean_score'])
		.limit(N_total)
		.repartition(4)
		.rdd
		# Each row is (comments, mean_score). Rows without a score are dropped
		# before the comparison, because `None <= 100` raises TypeError on
		# Python 3 and would fail the whole job.
		.filter(lambda row: row[1] is not None and row[1] <= 100)
		# Skip missing or empty comments.
		.filter(lambda row: row[0] is not None and len(row[0]) > 0)
		# Normalise the text: lower-case and strip apostrophes ("don't" -> "dont").
		.map(lambda row: (row[0].lower().replace('\'', ''), row[1]))
		# Emit one (word, score) pair per distinct word of a review; the set()
		# makes a word repeated inside one review count only once.
		.flatMap(lambda row: [(word, row[1]) for word in set(re.findall(r"[\w']+", row[0]))])
		# Keep only words longer than three characters.
		.filter(lambda pair: len(pair[0]) > 3)
		# (word, (occurrence count, score offset from the threshold)).
		.map(lambda pair: (pair[0], (1, pair[1] - score_threshold)))
		# Sum the counts and the offsets per word.
		.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
		# Discard words that do not occur often enough.
		.filter(lambda pair: pair[1][0] > N_limit)
		# Average score offset per word.
		.map(lambda pair: (pair[0], pair[1][1] / pair[1][0]))
		# Sort by the second value, i.e. the score, highest first.
		.sortBy(lambda pair: pair[1], ascending=False)
	)

	# Materialise the sorted (word, score) pairs on the driver.
	result = comments_cleaned.collect()
	# print('Top 10 words:', result[:10])
	# print('Bottom 10 words:', result[-10:])