Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import praw
# Sentiment scorer reused for post titles and comment bodies below.
sid = SentimentIntensityAnalyzer()

# Reddit API credentials; the placeholders must be filled in before running.
options = {
    'client_id': '',
    'client_secret': '',
    'user_agent': '',
}
reddit = praw.Reddit(**options)

# Link whose submissions are analysed; searched across all subreddits.
url = ''
posts = list(reddit.subreddit('all').search(f"url:{url}", limit=1000))
# One row per submission found by the search above.
posts_df = pd.DataFrame(
    [(post.id, post.subreddit.display_name, post.title, post.score,
      post.created_utc, post.url, post.num_comments, post.upvote_ratio)
     for post in posts],
    columns=['id', 'subreddit', 'title', 'score', 'created',
             'url', 'num_comments', 'upvote_ratio'])
# `created_utc` is an epoch timestamp in seconds.  Convert it in one vectorised
# step instead of calling `datetime.utcfromtimestamp` per row: that function is
# deprecated since Python 3.12, while `pd.to_datetime(..., unit='s')` yields the
# same timezone-naive UTC values.
posts_df['created'] = pd.to_datetime(posts_df['created'], unit='s')
# Upvote ratio: mean per subreddit, with the x axis rendered as a percentage.
(
    posts_df
    .groupby('subreddit')['upvote_ratio']
    .mean()
    .reset_index()
    .plot(kind='barh', x='subreddit', y='upvote_ratio',
          title='Upvote ratio', legend=False)
    .xaxis
    .set_major_formatter(FuncFormatter(lambda x, _: f'{x * 100:.1f}%'))
)
plt.tight_layout()
# Number of comments: total per subreddit.
(
    posts_df
    .groupby('subreddit')['num_comments']
    .sum()
    .reset_index()
    .plot(kind='barh', x='subreddit', y='num_comments',
          title='Number of comments', legend=False)
)
plt.tight_layout()
# Score: total per subreddit.
(
    posts_df
    .groupby('subreddit')['score']
    .sum()
    .reset_index()
    .plot(kind='barh', x='subreddit', y='score', title='Score', legend=False)
)
plt.tight_layout()
# Title sentiments: score every title with the analyzer, one column per
# component of the returned dict, aligned with the rows of `posts_df`.
posts_sentiments = pd.DataFrame(
    posts_df.title.map(sid.polarity_scores).tolist(),
    index=posts_df.index,
)
posts_df = posts_df.assign(**{
    f'title_{component}': posts_sentiments[component]
    for component in ('neg', 'neu', 'pos', 'compound')
})

# Sum the sentiment components per subreddit and plot them side by side.
(
    posts_df
    .groupby('subreddit')[['title_neg', 'title_neu', 'title_pos', 'title_compound']]
    .sum()
    .reset_index()
    .rename(columns={
        'title_neg': 'Negative',
        'title_pos': 'Positive',
        'title_neu': 'Neutral',
        'title_compound': 'Compound',
    })
    .plot(kind='barh', x='subreddit', title='Title sentiments', legend=True)
)
plt.tight_layout()
# Summary statistics of each subreddit's newest posts, keyed by display name:
# {'score': (mean, min, max), 'num_comments': (mean, min, max)}.
# Filled lazily so every subreddit is downloaded at most once per run, however
# many of the analysed posts belong to it.
SUBREDDIT_STATS_CACHE = {}


def _subreddit_stats(subreddit_name):
    """Return cached (mean, min, max) stats for a subreddit's newest posts."""
    stats = SUBREDDIT_STATS_CACHE.get(subreddit_name)
    if stats is None:
        recent_df = pd.DataFrame(
            [(recent.id, recent.score, recent.num_comments)
             for recent in reddit.subreddit(subreddit_name).new(limit=1000)],
            columns=['id', 'score', 'num_comments'])
        stats = {
            column: (recent_df[column].mean(),
                     recent_df[column].min(),
                     recent_df[column].max())
            for column in ('score', 'num_comments')
        }
        SUBREDDIT_STATS_CACHE[subreddit_name] = stats
    return stats


def _mean_range_normalize(value, mean, low, high):
    """Return (value - mean) / (high - low), or NaN when the range is unusable."""
    spread = high - low
    # A zero (or NaN) range would otherwise give +/-inf or NaN; NaN is returned
    # so that a caller's `fillna(0)` treats the value as "no signal".
    if not spread > 0:
        return float('nan')
    return (value - mean) / spread


def normalize(post):
    """Normalize a post's score and comment count against its subreddit.

    Each value is centred on the mean of the subreddit's newest posts (up to
    1000) and divided by their max - min range.

    Args:
        post: an object exposing `subreddit.display_name`, `score` and
            `num_comments`.

    Returns:
        A `(norm_score, norm_num_comments)` tuple. An element is NaN when the
        corresponding range in the subreddit sample is zero or undefined.
    """
    stats = _subreddit_stats(post.subreddit.display_name)
    norm_score = _mean_range_normalize(post.score, *stats['score'])
    norm_num_comments = _mean_range_normalize(post.num_comments,
                                              *stats['num_comments'])
    return norm_score, norm_num_comments
# Normalize every post against its own subreddit; undefined values become 0.
normalized_vals = pd.DataFrame(
    list(map(normalize, posts)),
    columns=['norm_score', 'norm_num_comments'],
).fillna(0)
posts_df[['norm_score', 'norm_num_comments']] = normalized_vals

# Normalized popularity: summed normalized score and comment count per subreddit.
(
    posts_df
    .groupby('subreddit')[['norm_score', 'norm_num_comments']]
    .sum()
    .reset_index()
    .rename(columns={
        'norm_score': 'Normalized score',
        'norm_num_comments': 'Normalized number of comments',
    })
    .plot(kind='barh', x='subreddit', title='Normalized popularity')
)
plt.tight_layout()

# Same sums, with the title compound sentiment plotted alongside.
(
    posts_df
    .groupby('subreddit')[['norm_score', 'norm_num_comments', 'title_compound']]
    .sum()
    .reset_index()
    .plot(kind='barh', x='subreddit', title='Normalized', legend=True)
)
plt.tight_layout()
# Half-open compound-sentiment intervals [low, high) and their bucket names:
# neg_neg < -0.6 <= neg_neu < -0.2 <= neu_neu < 0.2 <= pos_neu < 0.6 <= pos_pos
_SENTIMENT_BUCKET_EDGES = [float('-inf'), -0.6, -0.2, 0.2, 0.6, float('inf')]
_SENTIMENT_BUCKET_LABELS = ['neg_neg', 'neg_neu', 'neu_neu', 'pos_neu', 'pos_pos']


def handle_post_comments(post, analyzer=None):
    """Summarise a post's comments by sentiment bucket as a one-row DataFrame.

    Every comment with a `body` is scored with the analyzer, its score is
    normalized as (score - mean) / (max - min) within the post, and it is put
    into one of five buckets by compound sentiment. For each bucket that
    actually occurs, the row carries `root_comments_<bucket>_norm_score` (mean
    normalized score), `root_comments_<bucket>_amount` (comment count) and
    `root_comments_<bucket>_percent` (share of the post's scored comments).

    NOTE(review): despite the "root" naming, comments come from
    `post.comments.list()`; confirm whether replies are meant to be included.

    Args:
        post: an object exposing `id`, `num_comments` and `comments.list()`.
        analyzer: object with a `polarity_scores(text)` method returning a
            dict with 'neg', 'neu', 'pos' and 'compound' keys. Defaults to the
            module-level `sid`.

    Returns:
        A one-row DataFrame keyed by `key` (the post id), or None when the
        post has no comments or none of the fetched entries has a `body`.
    """
    if not post.num_comments:
        return None
    if analyzer is None:
        analyzer = sid

    # Entries without a `body` (placeholders for unloaded comments) are skipped.
    comments_df = pd.DataFrame(
        [(comment.id, comment.body, comment.score)
         for comment in post.comments.list()
         if hasattr(comment, 'body')],
        columns=['id', 'body', 'score'])
    if comments_df.empty:
        # Nothing to score; the sentiment columns below could not be built.
        return None

    # When all scores are equal the range is 0 and the result is NaN;
    # callers are expected to `fillna` the combined frame.
    score_range = comments_df.score.max() - comments_df.score.min()
    comments_df['norm_score'] = (
        (comments_df.score - comments_df.score.mean()) / score_range)

    sentiments = pd.DataFrame(
        [analyzer.polarity_scores(body) for body in comments_df.body],
        index=comments_df.index)
    comments_df = comments_df.assign(body_neg=sentiments['neg'],
                                     body_neu=sentiments['neu'],
                                     body_pos=sentiments['pos'],
                                     body_compound=sentiments['compound'])

    # `pd.cut` replaces the chain of `DataFrame.append` calls (removed in
    # pandas 2.0). Converting to plain strings keeps empty buckets out of the
    # groupby result, matching the previous output.
    comments_df['bucket'] = pd.cut(comments_df.body_compound,
                                   bins=_SENTIMENT_BUCKET_EDGES,
                                   labels=_SENTIMENT_BUCKET_LABELS,
                                   right=False).astype(str)

    buckets = (comments_df
               .groupby('bucket')
               .agg({'norm_score': 'mean', 'id': 'count'})
               .rename(columns={'id': 'amount'})
               .reset_index())
    buckets['percent'] = buckets.amount / buckets.amount.sum()

    # Flatten the per-bucket rows into one wide row for this post. The constant
    # `key` entry is kept so the output still has a `root_comments_key` column.
    row = {'post_id': post.id, 'key': 0}
    for bucket in buckets.itertuples(index=False):
        row[f'{bucket.bucket}_norm_score'] = bucket.norm_score
        row[f'{bucket.bucket}_amount'] = bucket.amount
        row[f'{bucket.bucket}_percent'] = bucket.percent

    return (pd.DataFrame([row])
            .add_prefix('root_comments_')
            .assign(key=post.id))
# Stack the per-post comment summaries; buckets a post lacks become 0.
posts_comments_df = pd.concat(
    list(map(handle_post_comments, posts))
).fillna(0)

# Attach the summaries to the posts via the post id; posts without a summary
# get 0 in every comment column.
posts_with_comments_df = pd.merge(
    posts_df.assign(key=posts_df['id']),
    posts_comments_df,
    on='key',
    how='left',
).fillna(0)
# Share of comments per sentiment bucket, averaged over each subreddit's posts.
percent_columns = ['root_comments_neg_neg_percent',
                   'root_comments_neg_neu_percent',
                   'root_comments_neu_neu_percent',
                   'root_comments_pos_neu_percent',
                   'root_comments_pos_pos_percent']
(
    posts_with_comments_df
    # A bucket that no post populated has no column at all; add it as 0
    # (the same value missing buckets get per post) instead of a KeyError.
    .reindex(columns=['subreddit'] + percent_columns, fill_value=0)
    .groupby('subreddit')[percent_columns]
    .mean()
    .reset_index()
    # Legend labels: strip the full 'root_comments_' prefix and '_percent'
    # suffix so the label is 'neg neg', not ' neg neg ' with stray padding.
    .rename(columns={
        column: column[len('root_comments_'):-len('_percent')].replace('_', ' ')
        for column in percent_columns
    })
    .plot(kind='bar', x='subreddit', legend=True,
          title='Percent of comments by sentiments buckets')
    .yaxis
    .set_major_formatter(FuncFormatter(lambda y, _: f'{y * 100:.1f}%'))
)
plt.tight_layout()
# Mean normalized comment score per sentiment bucket, averaged over each
# subreddit's posts.
norm_score_columns = ['root_comments_neg_neg_norm_score',
                      'root_comments_neg_neu_norm_score',
                      'root_comments_neu_neu_norm_score',
                      'root_comments_pos_neu_norm_score',
                      'root_comments_pos_pos_norm_score']
(
    posts_with_comments_df
    # A bucket that no post populated has no column at all; add it as 0
    # (the same value missing buckets get per post) instead of a KeyError.
    .reindex(columns=['subreddit'] + norm_score_columns, fill_value=0)
    .groupby('subreddit')[norm_score_columns]
    .mean()
    .reset_index()
    # Legend labels: strip the full 'root_comments_' prefix and '_norm_score'
    # suffix so the label is 'neg neg', not ' neg neg ' with stray padding.
    .rename(columns={
        column: column[len('root_comments_'):-len('_norm_score')].replace('_', ' ')
        for column in norm_score_columns
    })
    .plot(kind='bar', x='subreddit', legend=True,
          title='Mean normalized score of comments by sentiments buckets')
)
plt.tight_layout()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.