Skip to content

Instantly share code, notes, and snippets.

Last active May 31, 2022 01:02
Show Gist options
  • Save NanoDano/ccab59b487dfa417c35d to your computer and use it in GitHub Desktop.
Save NanoDano/ccab59b487dfa417c35d to your computer and use it in GitHub Desktop.
Word Frequency of a Subreddit's Post Titles
"""Analyze word frequency in subreddit post titles.

Outputs the number of times each word is seen, ordered by how often
the words are used.

Quick and dirty script that is not optimized.
"""
# --- Script configuration -------------------------------------------------
# Ignore words shorter than this many characters.
min_word_length = 5
# Only show words that were seen at least this many times.
min_word_frequency = 4
# Name of the subreddit whose post titles are analyzed.
subreddit = 'houston'
# Number of posts to request; used as the `limit` of the hot-posts query.
# Original author's note: max 1000.
request_limit = 1000
import operator
import praw
import re
# Anything that is not an ASCII letter or digit is stripped from each word.
_NON_ALNUM = re.compile(r'[^a-zA-Z0-9]')


def tally_title_words(titles, min_length):
    """Count how often each normalized word appears across post titles.

    Every title is split on whitespace. Each word then has all characters
    other than ASCII letters and digits removed and is lowercased, so that
    "Traffic," and "TRAFFIC" are counted as the same word.

    Args:
        titles: Iterable of post title strings.
        min_length: Words shorter than this many characters (measured after
            stripping special characters) are ignored.

    Returns:
        A dict mapping each normalized word to the number of times it was
        seen. A word seen once has a count of exactly 1.
    """
    wordbank = dict()
    for title in titles:
        for word in title.split():
            # Strip special chars
            word = _NON_ALNUM.sub('', word)
            # Skip words that are too short; also skip tokens that were
            # nothing but punctuation and are now empty.
            if not word or len(word) < min_length:
                continue
            # Lowercase to normalize
            word = word.lower()
            wordbank[word] = wordbank.get(word, 0) + 1
    return wordbank


def frequent_words(wordbank, min_frequency):
    """Return the (word, count) pairs seen at least ``min_frequency`` times.

    Args:
        wordbank: Dict mapping word -> number of times seen.
        min_frequency: Minimum count a word needs to be included.

    Returns:
        A list of (word, count) tuples sorted by ascending count, so the
        most used words come last.
    """
    ranked = sorted(wordbank.items(), key=operator.itemgetter(1))
    return [(word, seen) for word, seen in ranked if seen >= min_frequency]


def main():
    """Fetch the subreddit's hot posts and print the word-frequency report."""
    user_agent = ("Houston Subreddit Analyzer/0.1 by")
    reddit = praw.Reddit(user_agent=user_agent)
    posts = reddit.get_subreddit(subreddit).get_hot(limit=request_limit)
    # Read the title attribute directly instead of parsing str(post), which
    # is "<score> :: <title>": splitting that on ' :: ' cuts off any title
    # that itself contains ' :: '.
    titles = [post.title for post in posts]
    wordbank = tally_title_words(titles, min_word_length)
    # len(titles) is the exact number of posts fetched (no off-by-one).
    print("Processed " + repr(len(titles)) + " post titles.")
    for word, seen in frequent_words(wordbank, min_word_frequency):
        print(repr(seen) + " " + word)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment