Created
May 20, 2013 19:14
-
-
Save anonymous/5614692 to your computer and use it in GitHub Desktop.
Top of WAYWT Collector
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import praw | |
import datetime | |
import time | |
import re | |
import mimetypes | |
import logging | |
# Thread titles we care about look like "WAYWT - <Month> <day><suffix>",
# e.g. "WAYWT - May 20th": a capitalised word, then digits directly followed
# by lowercase letters, with nothing before or after.
waywt_title_pattern = re.compile(r"^WAYWT - [A-Z][a-z]+ [0-9]+[a-z]+$")

# Captures the target of every `a href="..."` attribute in a piece of HTML.
html_link_pattern = re.compile(r'a href="([^"]+)"')
def get_current_month_name():
    """
    Gives the full name of the month today's date falls in, as produced by
    the "%B" strftime directive (e.g. "May").
    """
    today = datetime.date.today()
    return today.strftime("%B")
def get_urls_from_comment(comment):
    """
    Extracts every link target from the HTML body of a comment.

    Reads ``comment.body_html`` and returns the list of strings captured by
    ``html_link_pattern``; the list is empty when nothing matches.
    """
    rendered_html = comment.body_html
    return html_link_pattern.findall(rendered_html)
def get_url_type(url):
    """
    Tries to guess whether a URL points to an image.

    The guess is made by ``mimetypes.guess_type`` from the URL text alone.
    Any query string or fragment is removed first, so that a URL such as
    "http://host/pic.jpg?1" is still recognised by its ".jpg" extension;
    not every interpreter version strips these itself.

    Returns "image" when the guessed MIME type starts with "image/", and
    "link" otherwise (including when no type could be guessed).
    """
    # Cut the fragment before the query: a "?" inside a fragment is not a
    # query separator.
    without_fragment = url.split("#", 1)[0]
    path_part = without_fragment.split("?", 1)[0]

    # guess_type returns (type, encoding); only the type matters here.
    link_type = mimetypes.guess_type(path_part)[0]
    if link_type is not None and link_type.startswith("image/"):
        return "image"
    return "link"
def main():
    """
    Collects the highest-scored top-level comments of recent WAYWT threads
    on /r/malefashionadvice and prints them as a MarkDown list.

    Steps:
      1. Search the subreddit for threads by MFAModerator whose title
         contains "WAYWT" and the current month name.
      2. Keep threads at most 31 days old whose title matches
         ``waywt_title_pattern``.
      3. From each kept thread, gather the comments with a score of at
         least 75.
      4. Sort the gathered comments by score (highest first) and print, for
         every comment containing at least one URL, a numbered entry
         followed by its URLs grouped into "Link" and "Image".

    Progress is reported through ``logging``; the list itself goes to
    standard output.
    """
    min_score = 75        # comments below this score are ignored
    max_age_days = 31     # threads older than this are ignored
    request_pause = 2     # seconds to wait after reading a thread

    logging.basicConfig(level=logging.INFO)

    # Connect to reddit
    reddit = praw.Reddit(user_agent='TopOfWAYWT Collector v0.1')

    # Query to search threads with WAYWT and the current month name in their
    # title
    query = "title:WAYWT and title:{} and author:MFAModerator".format(
        get_current_month_name())

    # Perform search on /r/malefashionadvice
    posts = reddit.search(query, subreddit="malefashionadvice")

    # This list will contain all the top comments
    comments = []

    # One reference date for the whole run, so every thread is measured
    # against the same "today".
    today = datetime.date.today()

    for submission in posts:
        # Age of the submission in whole days
        posted_on = datetime.date.fromtimestamp(int(submission.created_utc))
        age = today - posted_on

        # Too old? Ignore. (The search only filters by month *name*, so
        # without this check threads from earlier years could slip in.)
        if age.days > max_age_days:
            continue

        # Title doesn't match "WAYWT - Month Day"? Ignore.
        if waywt_title_pattern.match(submission.title) is None:
            continue

        logging.info("Checking %s posted %s days ago...",
                     submission.title, age.days)

        # Keep every real comment (MoreComments placeholders are skipped)
        # whose score reaches the threshold.
        comments.extend(
            comment for comment in submission.comments
            if not isinstance(comment, praw.objects.MoreComments)
            and comment.score >= min_score
        )

        # The original author's note: reddit asks for no more than thirty
        # requests per minute, hence the pause after each thread.
        time.sleep(request_pause)

    logging.info("Found %s comments.", len(comments))

    # Highest score first
    comments.sort(key=lambda comment: comment.score, reverse=True)

    # NOTE: rank counts every collected comment, so a comment skipped for
    # having no URLs still consumes its rank number.
    for rank, comment in enumerate(comments, 1):
        urls = get_urls_from_comment(comment)
        if not urls:
            logging.warning("No URLs found in comment %s.", comment.permalink)
            continue

        # Print information about the post: rank, permalink, author and score
        print("{}. [Post]({}) by *{}* (+{}) ".format(
            rank, comment.permalink, comment.author, comment.score))

        # Sort the URLs into their categories
        buckets = {
            "link": [],
            "image": [],
        }
        for url in urls:
            buckets[get_url_type(url)].append(url)

        # One output line per URL, categories in a fixed order so the output
        # does not depend on dict iteration order.
        link_lines = []
        for key in ("link", "image"):
            name = key.capitalize()
            for index, url in enumerate(buckets[key], 1):
                link_lines.append("[{} {}]({})".format(name, index, url))

        # Indent the first line by 4 spaces to let MarkDown place it on the
        # list item level; the remaining lines follow unindented. urls is
        # non-empty and every URL lands in a bucket, so link_lines[0] exists.
        link_lines[0] = "    " + link_lines[0]
        print("\n".join(link_lines))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment