# oaguy1/corpus_eda.py

## corpus_eda.py
import itertools

import pandas as pd
import praw
import spacy
from praw.models import Comment

# Reddit API credentials -- replace the placeholders with the values of your
# own registered application before running.
client_id = "your actual client id"
client_secret = "your client secret"
user_agent = "your user agent"

reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)

# reminder, text from reddit is generally nsfw
sub_name = "politics"
max_posts = 1000
sub = reddit.subreddit(sub_name)

comments = []         # one spaCy Doc per extracted comment, across all posts
comments_growth = []  # len(comments) after each post has been processed
corpus_sizes = []     # running total number of tokens after each post
vocab_sizes = []      # running number of distinct token texts after each post

nlp = spacy.load('en_core_web_sm')


def flatten_lists(corpus):
    """Return one flat list holding every item of every iterable in *corpus*."""
    return list(itertools.chain.from_iterable(corpus))


token_total = 0
vocab = set()

for submission in sub.hot(limit=max_posts):
    # Iterating submission.comments also yields placeholder objects that tell
    # praw to grab more comments; keep only real Comment instances.
    extracted = [
        nlp(comm.body)
        for comm in submission.comments
        if isinstance(comm, Comment)
    ]

    # append extracted comments to our corpus
    comments += extracted

    # store how many comments were stored
    comments_growth.append(len(comments))

    # Update the running totals with this post's tokens only, instead of
    # re-flattening the whole corpus prefix once per post afterwards.
    new_tokens = flatten_lists(extracted)
    token_total += len(new_tokens)

    # Compare tokens by their text: spaCy Token objects are distinct per
    # document position, so a set of Token objects never de-duplicates words
    # and the vocabulary curve would simply mirror the corpus curve.
    vocab.update(token.text for token in new_tokens)

    corpus_sizes.append(token_total)
    vocab_sizes.append(len(vocab))

# Row i holds the totals after the first i + 1 posts.
analysis = pd.DataFrame({'corpus': corpus_sizes, 'vocab': vocab_sizes})

# DataFrame.plot returns a matplotlib Axes, so name it accordingly rather than
# shadowing the conventional `plt` alias for matplotlib.pyplot.
ax = analysis.plot(title=f"Comment Growth in r/{sub_name}")
ax.set_xlabel("No of Posts")
# Both plotted series count tokens, not comments.
ax.set_ylabel("No of Tokens")
	import itertools

	import pandas as pd
	import praw
	import spacy
	from praw.models import Comment

	# Reddit API credentials -- replace the placeholders with the values of your
	# own registered application before running.
	client_id = "your actual client id"
	client_secret = "your client secret"
	user_agent = "your user agent"

	reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)

	# reminder, text from reddit is generally nsfw
	sub_name = "politics"
	max_posts = 1000
	sub = reddit.subreddit(sub_name)

	comments = []         # one spaCy Doc per extracted comment, across all posts
	comments_growth = []  # len(comments) after each post has been processed
	corpus_sizes = []     # running total number of tokens after each post
	vocab_sizes = []      # running number of distinct token texts after each post

	nlp = spacy.load('en_core_web_sm')


	def flatten_lists(corpus):
		"""Return one flat list holding every item of every iterable in *corpus*."""
		return list(itertools.chain.from_iterable(corpus))


	token_total = 0
	vocab = set()

	for submission in sub.hot(limit=max_posts):
		# Iterating submission.comments also yields placeholder objects that tell
		# praw to grab more comments; keep only real Comment instances.
		extracted = [
			nlp(comm.body)
			for comm in submission.comments
			if isinstance(comm, Comment)
		]

		# append extracted comments to our corpus
		comments += extracted

		# store how many comments were stored
		comments_growth.append(len(comments))

		# Update the running totals with this post's tokens only, instead of
		# re-flattening the whole corpus prefix once per post afterwards.
		new_tokens = flatten_lists(extracted)
		token_total += len(new_tokens)

		# Compare tokens by their text: spaCy Token objects are distinct per
		# document position, so a set of Token objects never de-duplicates words
		# and the vocabulary curve would simply mirror the corpus curve.
		vocab.update(token.text for token in new_tokens)

		corpus_sizes.append(token_total)
		vocab_sizes.append(len(vocab))

	# Row i holds the totals after the first i + 1 posts.
	analysis = pd.DataFrame({'corpus': corpus_sizes, 'vocab': vocab_sizes})

	# DataFrame.plot returns a matplotlib Axes, so name it accordingly rather than
	# shadowing the conventional `plt` alias for matplotlib.pyplot.
	ax = analysis.plot(title=f"Comment Growth in r/{sub_name}")
	ax.set_xlabel("No of Posts")
	# Both plotted series count tokens, not comments.
	ax.set_ylabel("No of Tokens")