Python 3 script for finding the top comments in subreddit(s). Supports OAuth and resumption.
# Searches for top comments
# By /u/PCJonathan
# Adapted and enhanced from the original at https://redd.it/3z31a6 (by /u/Pokechu22)
# This has only been tested with my own configuration so far (OAuth, detail mode, multiprocess enabled).
# There shouldn't be any problems, but there's no guarantee.
# If there are any problems, send me a message and I'll look into it.
# Progress is reported either way; install the tqdm module for a decent progress bar and ETA support.
# Use of OAuth is highly recommended, as it speeds up the process dramatically. Instructions here:
# https://github.com/SmBe19/praw-OAuth2Util/blob/master/OAuth2Util/README.md
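# For reference (a sketch on my part; the README linked above is authoritative),
# an oauth.ini for OAuth2Util looks roughly like this, with app_key/app_secret
# taken from your reddit app registration:
#   [app]
#   scope = identity,read
#   refreshable = True
#   app_key = your_app_key_here
#   app_secret = your_app_secret_here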
# This script also writes all gathered data to disk after collecting submissions and periodically while grabbing comments.
# Be aware that the file can get very large, and constant rewrites are not fantastic for SSDs.
# You can turn this off using the RESUME variable.
import praw, time, os.path, pickle
from datetime import datetime, date, timedelta
### IF YOU CHANGE ANY OF THESE SETTINGS, DELETE THE <SUBREDDIT>.dat RESUME FILE TO RESET STATUS.
# (Although some edits, e.g. adding a subreddit or extending the date range, may be safe without a reset.)
SUBREDDIT = "Subreddit" #Multis are also supported, EG "modtalk+modsupport"
FROM = date(2015,1,1) #Year,Month,Day
TO = date(2016,1,1)
# Detail mode attempts to check everything. It is intended for very, very busy subs and can take an age.
# It took an estimated 4.5-5 hours to scan /r/DoctorWho for 2015 (7340 posts) using OAuth.
# Disable to rely on reddit's top sorting instead.
DETAIL_MODE = False
# PRAW connection settings
MULTIPROCESS = False  # If using PRAW's multiprocess handler, set to True
OAUTH = False  # If you're using OAuth, set to True. Requires oauth.ini in the same folder.
USERNAME = None  # If logging in via cookie instead, enter username and password strings here,
PASSWORD = None  # e.g. USERNAME = "pcjonathan"
NUM_TOP_COMMENTS = 100 # How many comments to give results for
# How many comments to get per submission. The higher the number, the longer it may take.
# Max 500 (1500 if oauthed with a gold account or a moderator)
NUM_COMMENTS_PER_SUBMISSION = 500
# How many submissions to get for each search result.
# Max is about 1000; set to None to get the real max
NUM_SEARCH_RESULTS = None
# Turn resume saving on and off. An already existing resume file is still loaded either way.
RESUME = True
##### END CONFIG
filename = SUBREDDIT + ".dat"
def save_d():
    if RESUME:
        with open(filename, 'wb') as f:
            pickle.dump(d, f)
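# Note: save_d() rewrites the whole pickle in place, so a crash mid-dump can
# corrupt the resume file; writing to a temp file and renaming would be a safer
# alternative.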
try:
    from tqdm import tqdm
    progress = True
except ImportError:
    progress = False
if os.path.isfile(filename):
    with open(filename, "rb") as f:
        d = pickle.load(f)
else:
    d = {}
#http://stackoverflow.com/questions/1060279/iterating-through-a-range-of-dates-in-python
def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)
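# Example: daterange(date(2015, 1, 1), date(2015, 1, 3)) yields 2015-01-01 and
# 2015-01-02; the end date itself is excluded.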
def add_posts_to_dict(post_list):
    # Avoid duplicate posts
    for post in post_list:
        if post.id not in d:
            d[post.id] = {"object": post, "comments": {}, "complete": False}
useragent = "Top comment finder script, by /u/pcjonathan, modified from /u/pokechu22: https://redd.it/3z31a6"
if MULTIPROCESS:
    from praw.handlers import MultiprocessHandler
    r = praw.Reddit(useragent, handler=MultiprocessHandler())
else:
    r = praw.Reddit(useragent)
if OAUTH:
    import OAuth2Util
    o = OAuth2Util.OAuth2Util(r)
    o.refresh(force=True)
    r.config.api_request_delay = 1.0
elif USERNAME is not None and PASSWORD is not None:
    r.login(USERNAME, PASSWORD)
    print("Logged in as", r.get_me().name)
query = "timestamp:{}..{}"
def get_posts(query, sort="top"):
    posts = list(r.search(query, subreddit=SUBREDDIT, sort=sort, syntax="cloudsearch", limit=NUM_SEARCH_RESULTS))
    add_posts_to_dict(posts)
def get_posts_for_date(single_date):
    global from_timestamp, to_timestamp, query
    from_timestamp = time.mktime(single_date.timetuple())
    to_timestamp = time.mktime((single_date + timedelta(days=1)).timetuple())
    q = query.format(int(from_timestamp), int(to_timestamp))
    get_posts(q)
    if SUBREDDIT == "All":
        get_posts(q, sort="comments")
if DETAIL_MODE and len(d) == 0:
    print("In Detail Mode\n")
    if progress:
        for single_date in tqdm(daterange(FROM, TO), desc="Grabbing each date's posts", total=(TO - FROM).days, unit="Days"):
            get_posts_for_date(single_date)
    else:
        for single_date in daterange(FROM, TO):
            get_posts_for_date(single_date)
            print("Completed Date:", single_date)
elif len(d) == 0:
    print("NOT In Detail Mode")
    # http://stackoverflow.com/q/9637838/3991344
    from_timestamp = time.mktime(FROM.timetuple())
    to_timestamp = time.mktime(TO.timetuple())
    q = query.format(int(from_timestamp), int(to_timestamp))
    get_posts(q)
    get_posts(q, sort="comments")
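    # Note: searching twice (default "top" sort, then "comments" sort) presumably
    # widens coverage, since reddit search returns at most ~1000 results per query.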
save_d()
print("Analysing comments on {} posts...\n".format(len(d)))
all_comments = []
GRABBED = 0
def get_comments(s, sub):
    global GRABBED
    if not sub["complete"]:
        submission = r.get_submission(submission_id=s, comment_limit=NUM_COMMENTS_PER_SUBMISSION, comment_sort="TOP")
        if DETAIL_MODE:
            submission.replace_more_comments(limit=None)  # Get all comments
        else:
            submission.replace_more_comments(limit=0)  # Get rid of MoreComments stubs
        coms = praw.helpers.flatten_tree(submission.comments)
        for com in coms:
            sub["comments"][com.id] = com
        sub["complete"] = True
        GRABBED += 1
        if (GRABBED % 20) == 0:
            save_d()
    all_comments.extend(sub["comments"].values())
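# (In PRAW 3, replace_more_comments(limit=None) keeps fetching "load more comments"
# stubs until none remain, which is the main reason detail mode is so slow.)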
if progress:
    for s, sub in tqdm(d.items(), desc="Grabbing Comments from Posts", unit=" Posts"):
        get_comments(s, sub)
else:
    count = 0
    for s, sub in d.items():
        count += 1
        get_comments(s, sub)
        print("Completed submission {} of {}".format(count, len(d)))
print("Grabbed {} Submissions this session".format(GRABBED))
if GRABBED > 0:
    save_d()
print("Retrieved {} comments in {} posts".format(len(all_comments),len(d)))
top_comments = sorted(all_comments, key=lambda comment: -comment.score)[:NUM_TOP_COMMENTS]
table = "Score|User|Post Title|Link\n-----|------|----------|---------------\n"
for comment in top_comments:
    table += "%s|%s|%s|[link](%s)\n" % (comment.score, ("/u/" + comment.author.name) if comment.author else "[DELETED]", comment.submission.title if comment.submission else "[DELETED (!?)]", comment.permalink)
with open(SUBREDDIT + ".txt", "w") as f:
    f.write(table)
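# The resulting <SUBREDDIT>.txt contains a reddit-markdown table along these lines
# (illustrative row, not real data):
#   Score|User|Post Title|Link
#   -----|------|----------|---------------
#   1234|/u/someuser|Some post title|[link](https://www.reddit.com/r/.../)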