Skip to content

Instantly share code, notes, and snippets.

@nvbn
Created November 27, 2018 23:37
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save nvbn/ece1528ff5af2ecac6d2ee39234287ea to your computer and use it in GitHub Desktop.
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import praw
# Reddit API credentials; the values are empty placeholders here.
options = {
    'client_id': '',
    'client_secret': '',
    'user_agent': '',
}
reddit = praw.Reddit(**options)

# VADER analyzer used for the title and comment sentiment scores below.
sid = SentimentIntensityAnalyzer()

# Link to look up (empty placeholder). Submissions matching `url:<link>` are
# searched across r/all and materialised into a list, capped at 1000 results.
url = ''
posts = list(reddit.subreddit('all').search(f"url:{url}", limit=1000))
# One row per found submission, holding the fields the plots below rely on.
posts_df = pd.DataFrame(
    [
        (
            post.id,
            post.subreddit.display_name,
            post.title,
            post.score,
            post.created_utc,
            post.url,
            post.num_comments,
            post.upvote_ratio,
        )
        for post in posts
    ],
    columns=['id', 'subreddit', 'title', 'score', 'created',
             'url', 'num_comments', 'upvote_ratio'],
)
# `created_utc` is a Unix timestamp in seconds. It is converted with pandas
# rather than `datetime.utcfromtimestamp`, which is deprecated since
# Python 3.12; the column still holds naive datetimes expressed in UTC.
posts_df['created'] = pd.to_datetime(posts_df['created'], unit='s')
# Upvote ratio: mean per subreddit, drawn as horizontal bars.
upvote_ratio_by_subreddit = (
    posts_df[['subreddit', 'upvote_ratio']]
    .groupby('subreddit')
    .mean()['upvote_ratio']
    .reset_index()
)
upvote_ratio_axes = upvote_ratio_by_subreddit.plot(
    kind='barh', x='subreddit', y='upvote_ratio',
    title='Upvote ratio', legend=False,
)
# Ratios are fractions; show the tick labels as percentages.
upvote_ratio_axes.xaxis.set_major_formatter(
    FuncFormatter(lambda value, _pos: f'{value * 100:.1f}%'))
plt.tight_layout()
# Number of comments: total per subreddit, drawn as horizontal bars.
comments_by_subreddit = (
    posts_df[['subreddit', 'num_comments']]
    .groupby('subreddit')
    .sum()['num_comments']
    .reset_index()
)
comments_by_subreddit.plot(
    kind='barh', x='subreddit', y='num_comments',
    title='Number of comments', legend=False,
)
plt.tight_layout()
# Score: total per subreddit, drawn as horizontal bars.
score_by_subreddit = (
    posts_df[['subreddit', 'score']]
    .groupby('subreddit')
    .sum()['score']
    .reset_index()
)
score_by_subreddit.plot(
    kind='barh', x='subreddit', y='score', title='Score', legend=False,
)
plt.tight_layout()
# Title sentiments: score every title with VADER, then sum per subreddit.
title_scores = posts_df.title.apply(sid.polarity_scores)
# Expand the per-title score dicts into one column per score.
posts_sentiments = title_scores.apply(pd.Series)
posts_df = posts_df.assign(**{
    'title_neg': posts_sentiments.neg,
    'title_neu': posts_sentiments.neu,
    'title_pos': posts_sentiments.pos,
    'title_compound': posts_sentiments['compound'],
})

title_sentiment_columns = ['title_neg', 'title_neu', 'title_pos', 'title_compound']
title_sentiment_labels = {
    'title_neg': 'Negative',
    'title_pos': 'Positive',
    'title_neu': 'Neutral',
    'title_compound': 'Compound',
}
(
    posts_df[['subreddit'] + title_sentiment_columns]
    .groupby('subreddit')
    .sum()[title_sentiment_columns]
    .reset_index()
    .rename(columns=title_sentiment_labels)
    .plot(kind='barh', x='subreddit', title='Title sentiments', legend=True)
)
plt.tight_layout()
# Baseline statistics per subreddit, keyed by display name. Fetching up to
# 1000 posts is a slow network call, so it is done once per subreddit instead
# of once per post.
_subreddit_baselines = {}


def _subreddit_baseline(subreddit_name):
    """Return ((mean, range), (mean, range)) for score and comment count.

    The statistics are computed over the subreddit's newest posts (at most
    1000) and cached for the lifetime of the process.
    """
    if subreddit_name not in _subreddit_baselines:
        recent_df = pd.DataFrame(
            [(recent.id, recent.score, recent.num_comments)
             for recent in reddit.subreddit(subreddit_name).new(limit=1000)],
            columns=['id', 'score', 'num_comments'])
        if recent_df.empty:
            # Nothing to compare against: every statistic is undefined.
            undefined = (float('nan'), float('nan'))
            baseline = (undefined, undefined)
        else:
            baseline = tuple(
                (recent_df[column].mean(),
                 recent_df[column].max() - recent_df[column].min())
                for column in ('score', 'num_comments'))
        _subreddit_baselines[subreddit_name] = baseline
    return _subreddit_baselines[subreddit_name]


def _mean_normalize(value, mean, value_range):
    """Return (value - mean) / value_range, or NaN if the range is zero or NaN."""
    # `not x > 0` is also true for NaN, so undefined ranges land here too.
    if not value_range > 0:
        return float('nan')
    return (value - mean) / value_range


def normalize(post):
    """Mean-normalise a post's score and comment count within its subreddit.

    Each value is expressed as ``(value - mean) / (max - min)``, where the
    statistics come from the newest posts of the post's own subreddit.

    Args:
        post: a submission exposing ``subreddit.display_name``, ``score`` and
            ``num_comments``.

    Returns:
        A ``(norm_score, norm_num_comments)`` tuple. A component is NaN when
        the subreddit sample is empty or all sampled values are equal; this
        avoids the +/-inf a division by zero would produce, and NaN is what
        the caller's ``fillna(0)`` replaces.
    """
    (score_mean, score_range), (comments_mean, comments_range) = \
        _subreddit_baseline(post.subreddit.display_name)
    norm_score = _mean_normalize(post.score, score_mean, score_range)
    norm_num_comments = _mean_normalize(
        post.num_comments, comments_mean, comments_range)
    return norm_score, norm_num_comments
# Normalise every found post against its own subreddit; NaN results become 0.
normalized_rows = [normalize(post) for post in posts]
normalized_vals = pd.DataFrame(
    normalized_rows, columns=['norm_score', 'norm_num_comments'],
).fillna(0)
posts_df[['norm_score', 'norm_num_comments']] = normalized_vals

# Normalized popularity: per-subreddit sums of both normalized values.
popularity_columns = ['norm_score', 'norm_num_comments']
popularity_labels = {
    'norm_score': 'Normalized score',
    'norm_num_comments': 'Normalized number of comments',
}
(
    posts_df[['subreddit'] + popularity_columns]
    .groupby('subreddit')
    .sum()[popularity_columns]
    .reset_index()
    .rename(columns=popularity_labels)
    .plot(kind='barh', x='subreddit', title='Normalized popularity')
)
plt.tight_layout()

# The same sums, plotted next to the summed compound title sentiment.
combined_columns = popularity_columns + ['title_compound']
(
    posts_df[['subreddit'] + combined_columns]
    .groupby('subreddit')
    .sum()[combined_columns]
    .reset_index()
    .plot(kind='barh', x='subreddit', title='Normalized', legend=True)
)
plt.tight_layout()
def _sentiment_bucket(compound):
    """Map a VADER compound score to a sentiment bucket name.

    Buckets are half-open intervals split at -0.6, -0.2, 0.2 and 0.6.
    Returns None for a value that falls in none of them (i.e. NaN).
    """
    if compound >= 0.6:
        return 'pos_pos'
    if compound >= 0.2:
        return 'pos_neu'
    if compound >= -0.2:
        return 'neu_neu'
    if compound >= -0.6:
        return 'neg_neu'
    if compound < -0.6:
        return 'neg_neg'
    return None


def handle_post_comments(post):
    """Summarise a post's comments by sentiment bucket.

    Every comment body is scored with VADER and placed into one of five
    buckets by its compound score. For each bucket that occurs, the mean
    normalised comment score, the number of comments and their share of all
    bucketed comments are reported.

    NOTE(review): comments come from ``post.comments.list()``; per the PRAW
    docs this presumably includes replies as well, despite the
    ``root_comments_`` column prefix -- confirm. Entries without a ``body``
    attribute are skipped.

    Args:
        post: a submission exposing ``id``, ``num_comments`` and ``comments``.

    Returns:
        A one-row DataFrame with columns ``root_comments_post_id``,
        ``root_comments_key``, ``root_comments_<bucket>_norm_score``,
        ``root_comments_<bucket>_amount``, ``root_comments_<bucket>_percent``
        (only for buckets that occur) and ``key`` (the post id), or None when
        the post has no comments with a body.
    """
    if not post.num_comments:
        return None

    comments_df = pd.DataFrame(
        [(comment.id, comment.body, comment.score)
         for comment in post.comments.list()
         if hasattr(comment, 'body')],
        columns=['id', 'body', 'score'])
    if comments_df.empty:
        # `num_comments` was non-zero but nothing usable was loaded; without
        # this guard the sentiment step would fail on an empty frame.
        return None

    # Mean-normalise comment scores within this post. When every score is
    # equal the range is zero, so use 0.0 instead of dividing by zero.
    score_range = comments_df.score.max() - comments_df.score.min()
    if score_range:
        comments_df['norm_score'] = (
            (comments_df.score - comments_df.score.mean()) / score_range)
    else:
        comments_df['norm_score'] = 0.0

    compound = comments_df.body.apply(
        lambda body: sid.polarity_scores(body)['compound'])
    # Label each comment directly; this replaces five filtered copies glued
    # together with DataFrame.append, which was removed in pandas 2.0.
    comments_df['bucket'] = compound.apply(_sentiment_bucket)

    buckets = (
        comments_df
        .groupby('bucket')
        .agg({'norm_score': 'mean', 'id': 'count'})
        .rename(columns={'id': 'amount'})
        .reset_index()
    )
    buckets['percent'] = buckets.amount / buckets.amount.sum()

    # Flatten the per-bucket rows into a single wide row for this post.
    # `key` is 0 here only so that the prefixed `root_comments_key` column of
    # the previous implementation is still present in the result.
    row = {'post_id': post.id, 'key': 0}
    for bucket in buckets.itertuples(index=False):
        row[f'{bucket.bucket}_norm_score'] = bucket.norm_score
        row[f'{bucket.bucket}_amount'] = bucket.amount
        row[f'{bucket.bucket}_percent'] = bucket.percent

    return (
        pd.DataFrame([row])
        .add_prefix('root_comments_')
        .assign(key=post.id)
    )
# Per-post comment statistics. `handle_post_comments` returns None for a post
# with no comments; drop those explicitly, because `pd.concat` raises
# ValueError when every object it receives is None.
comment_frames = [
    frame
    for frame in (handle_post_comments(post) for post in posts)
    if frame is not None
]
if comment_frames:
    # Buckets missing for a given post show up as NaN after concat; use 0.
    posts_comments_df = pd.concat(comment_frames).fillna(0)
else:
    # No post had comments: keep only the join key so the merge still works.
    posts_comments_df = pd.DataFrame(columns=['key'])

# Left join keeps every post; posts without comment statistics get zeros.
posts_with_comments_df = (
    posts_df
    .assign(key=lambda frame: frame.id)
    .merge(posts_comments_df, on='key', how='left')
    .fillna(0)
)
percent_columns = ['root_comments_neg_neg_percent',
                   'root_comments_neg_neu_percent', 'root_comments_neu_neu_percent',
                   'root_comments_pos_neu_percent', 'root_comments_pos_pos_percent']
# Legend labels such as 'neg neg'. The prefix and suffix are stripped by their
# real lengths; a hard-coded [13:-7] slice is off by one at both ends and
# leaves a stray space on each side of every label.
percent_labels = {
    column: column[len('root_comments_'):-len('_percent')].replace('_', ' ')
    for column in percent_columns
}
percent_axes = (
    posts_with_comments_df
    # A bucket that occurred in no post has no column at all; add it as zeros
    # rather than failing with KeyError.
    .reindex(columns=['subreddit'] + percent_columns, fill_value=0)
    .groupby('subreddit')
    .mean()[percent_columns]
    .reset_index()
    .rename(columns=percent_labels)
    .plot(kind='bar', x='subreddit', legend=True,
          title='Percent of comments by sentiments buckets')
)
# Shares are fractions; show the tick labels as percentages.
percent_axes.yaxis.set_major_formatter(
    FuncFormatter(lambda value, _pos: f'{value * 100:.1f}%'))
plt.tight_layout()
norm_score_columns = ['root_comments_neg_neg_norm_score',
                      'root_comments_neg_neu_norm_score',
                      'root_comments_neu_neu_norm_score',
                      'root_comments_pos_neu_norm_score',
                      'root_comments_pos_pos_norm_score']
# Legend labels such as 'neg neg'. The prefix and suffix are stripped by their
# real lengths; a hard-coded [13:-10] slice is off by one at both ends and
# leaves a stray space on each side of every label.
norm_score_labels = {
    column: column[len('root_comments_'):-len('_norm_score')].replace('_', ' ')
    for column in norm_score_columns
}
(
    posts_with_comments_df
    # A bucket that occurred in no post has no column at all; add it as zeros
    # rather than failing with KeyError.
    .reindex(columns=['subreddit'] + norm_score_columns, fill_value=0)
    .groupby('subreddit')
    .mean()[norm_score_columns]
    .reset_index()
    .rename(columns=norm_score_labels)
    .plot(kind='bar', x='subreddit', legend=True,
          title='Mean normalized score of comments by sentiments buckets')
)
plt.tight_layout()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment