Skip to content

Instantly share code, notes, and snippets.

@oaguy1
Last active June 5, 2020 19:16
Show Gist options
  • Save oaguy1/599cbb1046fde618faa41624369249da to your computer and use it in GitHub Desktop.
Save oaguy1/599cbb1046fde618faa41624369249da to your computer and use it in GitHub Desktop.
Simple Reddit EDA for NLP
import itertools

import pandas as pd
import praw
import spacy
from praw.models import Comment
# --- Reddit API credentials (placeholders: substitute your own values) ---
client_id = "your actual client id"
client_secret = "your client secret"
user_agent = "your user agent"

# --- What to sample ---
# reminder, text from reddit is generally nsfw
sub_name = "politics"
max_posts = 1000

# --- Clients: the Reddit API handle, the target subreddit, the NLP pipeline ---
reddit = praw.Reddit(
    user_agent=user_agent,
    client_secret=client_secret,
    client_id=client_id,
)
sub = reddit.subreddit(sub_name)
nlp = spacy.load('en_core_web_sm')

# --- Accumulators filled in by the scraping loop ---
comments = []         # every parsed comment, across all posts
comments_growth = []  # len(comments) recorded after each post
# Walk the subreddit's "hot" listing (at most `max_posts` submissions) and run
# each comment body through the spaCy pipeline, growing one flat corpus.
for submission in sub.hot(limit=max_posts):
    # extract all the comments into a list of parsed documents; the
    # isinstance check skips the placeholder entries that tell praw to grab
    # more comments
    extracted = [
        nlp(comm.body)
        for comm in submission.comments
        if isinstance(comm, Comment)
    ]
    # append extracted comments to our corpus
    comments.extend(extracted)
    # store how many comments the corpus holds after this post; these running
    # totals are the slice boundaries used by the later analysis
    comments_growth.append(len(comments))
def flatten_lists(corpus):
    """Flatten one level of nesting: an iterable of iterables becomes a flat list."""
    return list(itertools.chain.from_iterable(corpus))


# Build two running totals, one row per post:
#   corpus - number of tokens seen so far
#   vocab  - number of distinct token texts seen so far
# Each post only contributes the comments added since the previous post, so
# every comment is flattened exactly once instead of once per later row.
corpus_sizes = {}
vocab_sizes = {}
token_total = 0
seen_words = set()
previous_end = 0
for post_index, last_comment_index in enumerate(comments_growth):
    new_tokens = flatten_lists(comments[previous_end:last_comment_index])
    token_total += len(new_tokens)
    # Deduplicate on the token's text rather than on the Token object: spaCy
    # tokens are distinct per document position, so a set of Token objects
    # would count every token as unique and `vocab` would just mirror `corpus`.
    # The comparison is on the exact surface form (no case folding).
    seen_words.update(token.text for token in new_tokens)
    corpus_sizes[post_index] = token_total
    vocab_sizes[post_index] = len(seen_words)
    previous_end = last_comment_index

analysis = pd.DataFrame()
analysis['corpus'] = pd.Series(corpus_sizes)
analysis['vocab'] = pd.Series(vocab_sizes)
# Chart every column of `analysis` against the post index. Note that `plt`
# is whatever `DataFrame.plot` returns (the chart's axes), not the
# matplotlib.pyplot module that the name usually stands for.
# NOTE(review): the plotted series look like token counts rather than comment
# counts - confirm whether the y-axis label should say so.
plt = analysis.plot(
    title=f"Comment Growth in r/{sub_name}",
)
plt.set(xlabel="No of Posts", ylabel="No of Comments")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment