Skip to content

Instantly share code, notes, and snippets.

Created May 20, 2013 19:14
Show Gist options
  • Save anonymous/5614692 to your computer and use it in GitHub Desktop.
Save anonymous/5614692 to your computer and use it in GitHub Desktop.
Top of WAYWT Collector
import praw
import datetime
import time
import re
import mimetypes
import logging
# Regex which (hopefully) matches the WAYWT thread titles, e.g.
# "WAYWT - May 20th": the literal prefix, one capitalised word, then digits
# followed by at least one lowercase letter (the ordinal suffix).
waywt_title_pattern = re.compile(r'^WAYWT - [A-Z][a-z]+ [0-9]+[a-z]+$')
# Regex capturing the target of every `a href="..."` attribute found in a
# comment's HTML body.
html_link_pattern = re.compile(r'a href="([^"]+)"')
def get_current_month_name():
    """
    Give the full month name (``%B``) of today's date as a string.
    """
    today = datetime.date.today()
    return "{:%B}".format(today)
def get_urls_from_comment(comment):
    """
    Collect every link target found in a comment.

    Runs ``html_link_pattern`` over ``comment.body_html`` and returns the
    captured ``href`` values as a list, in order of appearance (empty when
    nothing matches).
    """
    rendered = comment.body_html
    return html_link_pattern.findall(rendered)
def get_url_type(url):
    """
    Tries to guess whether a URL points to an image.

    The guess is made from the URL text alone via ``mimetypes.guess_type``;
    this function performs no request of its own.  A query string or
    fragment is dropped first so that e.g. ``pic.jpg?1`` is still judged by
    its ``.jpg`` extension.

    Returns ``"image"`` when the guessed MIME type starts with ``image/``,
    and ``"link"`` otherwise (including when no type can be guessed).
    """
    # Cut off "#fragment" and "?query": on interpreters whose guess_type
    # does not isolate the URL path they stay attached to the extension and
    # make the lookup fail.
    path = url.split("#", 1)[0].split("?", 1)[0]
    link_type, _ = mimetypes.guess_type(path)
    if link_type is None:
        return "link"
    return "image" if link_type.startswith("image/") else "link"
if __name__ == "__main__":
    # Script entry point: search the month's WAYWT threads, collect the
    # comments scoring at least 75, and print them as a Markdown list ranked
    # by score.
    #
    # Output uses single-argument print(...) calls only, which behave the
    # same as a Python 2 print statement and as the Python 3 function.
    logging.basicConfig(level=logging.INFO)

    # Connect to reddit
    reddit = praw.Reddit(user_agent='TopOfWAYWT Collector v0.1')

    # Query to search threads with WAYWT and the current month name in their
    # title
    query = "title:WAYWT and title:{} and author:MFAModerator".format(get_current_month_name())

    # Perform search on /r/malefashionadvice
    posts = reddit.search(query, subreddit="malefashionadvice")

    # This list will contain all the top comments
    comments = []

    # Go through each submission
    for submission in posts:
        # Get difference between submission date and today
        posted = datetime.date.fromtimestamp(int(submission.created_utc))
        diff = datetime.date.today() - posted

        # Submitted more than 31 days ago? Ignore.
        if diff.days > 31:
            continue

        # Title doesn't match "WAYWT - Month Day"? Ignore.
        if waywt_title_pattern.match(submission.title) is None:
            continue

        logging.info("Checking {} posted {} ago...".format(submission.title, diff.days))

        # Check each comment of the submission
        for comment in submission.comments:
            # Placeholder objects for not-yet-loaded comments carry nothing
            # to rank, so skip them.
            if isinstance(comment, praw.objects.MoreComments):
                continue

            # That's what we're looking for
            if comment.score >= 75:
                comments.append(comment)

        # Reddit says: Make no more than thirty requests per minute, so let's
        # sleep for 2 seconds.
        time.sleep(2)

    logging.info("Found {} comments.".format(len(comments)))

    # Sort comments, highest score first
    comments.sort(key=lambda comment: comment.score, reverse=True)

    for rank, comment in enumerate(comments, 1):
        urls = get_urls_from_comment(comment)
        if not urls:
            # The rank is still consumed, so the printed numbering can have
            # gaps for skipped comments.
            logging.warning("No URLs found in comment {}.".format(comment.permalink))
            continue

        # Print informations about the post: rank, permalink, author and score
        print("{}. [Post]({}) by *{}* (+{}) ".format(rank, comment.permalink, comment.author, comment.score))

        # Group the comment's URLs by guessed type, reusing the list
        # extracted above instead of parsing the comment a second time.
        buckets = {
            "link": [],
            "image": [],
        }
        for url in urls:
            buckets[get_url_type(url)].append(url)

        # Build one "[Name N](url)" entry per URL, numbered per category
        links = []
        for key, values in buckets.items():
            name = key.capitalize()
            for index, url in enumerate(values, 1):
                links.append("[{} {}]({})".format(name, index, url))

        # The first entry is indented by 4 spaces to let MarkDown keep it on
        # the list item level; the remaining entries follow one per line.
        # `urls` is non-empty here, so there is always at least one entry.
        print("    " + "\n".join(links))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment