Last active
July 31, 2016 19:34
-
-
Save pcjonathan/c16065ce38dbe4833044 to your computer and use it in GitHub Desktop.
Python 3 script for finding the top comments in one or more subreddits. Supports OAuth and resumption.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Searches for top comments
# By /u/PCJonathan
# Adapted and enhanced from the original at https://redd.it/3z31a6 (by /u/Pokechu22)
# This has only been tested using my configuration: OAuth, detail mode, with multiprocess on, so far.
# There shouldn't be any problems; however, there's no guarantee.
# If there are any problems, send me a message and I'll look into it.
# Progress is reported. Install the tqdm module for a decent progress bar and ETA support.
# Use of OAuth is highly recommended, as it speeds up the process dramatically. Instructions here:
# https://github.com/SmBe19/praw-OAuth2Util/blob/master/OAuth2Util/README.md
# This script also writes all the data it has gathered to disk after collecting submissions and after each submission's comments.
# Be aware that the file can get very large, and constant rewrites are not fantastic for SSDs.
# You can turn this off using the RESUME variable.
import praw, time, os.path, pickle | |
from datetime import datetime, date, timedelta | |
### IF YOU CHANGE ANY OF THESE SETTINGS, DELETE DICT.DAT TO RESET STATUS.
# (Although some edits -- e.g. adding a subreddit or extending the date
#  range -- may work without a reset; when in doubt, start fresh.)
SUBREDDIT = "Subreddit" #Multis are also supported, EG "modtalk+modsupport"
FROM = date(2015,1,1) #Year,Month,Day -- start of the search window (inclusive)
TO = date(2016,1,1)   # end of the search window (exclusive; see daterange below)
#Detail mode attempts to check everything. It is intended for very very busy subs and can take an age.
#It took an estimated 4.5-5 hours to scan /r/DoctorWho for 2015 (7340 posts) using OAuth.
# Disable to rely on reddit's top sorting instead
DETAIL_MODE = False
#Praw connection settings
MULTIPROCESS = False #If using praw-multiprocess, set to true
OAUTH = False #If you're using oauth, set to true. Requires oauth.ini in same folder.
USERNAME = None #If logging in via cookie, enter username and password strings here
PASSWORD = None # e.g. "pcjonathan"
NUM_TOP_COMMENTS = 100 # How many comments to give results for
# How many comments to get per submission. The higher the number, the longer it may take.
# Max 500 (1500 if oauthed with a gold account or a moderator)
NUM_COMMENTS_PER_SUBMISSION = 500
# How many submissions to get for each search result.
# Max is about 1000; set to None to get the real max
NUM_SEARCH_RESULTS = None
# Turn resume saving on and off. Does not affect checking for an already existing file
RESUME = True
##### END CONFIG
filename = SUBREDDIT + ".dat"  # pickle file used for saving/resuming progress
def save_d():
    """Persist the accumulated data dict `d` to the resume file.

    Does nothing when RESUME is disabled.
    """
    if not RESUME:
        return
    with open(filename, 'wb') as out:
        pickle.dump(d, out)
# Optional dependency: tqdm provides the progress bars/ETA used below.
# Fall back to plain print-based progress when it is not installed.
try:
    from tqdm import *
    progress = True
except ImportError:
    progress = False
# Resume support: reload previously collected data if a save file exists,
# otherwise start with an empty collection dict.
# d maps submission id -> {"object": submission, "comments": {id: comment}, "complete": bool}
if os.path.isfile(filename):
    with open(filename,"rb") as f:
        d = pickle.load(f)
else:
    d = {}
#http://stackoverflow.com/questions/1060279/iterating-through-a-range-of-dates-in-python | |
def daterange(start_date, end_date):
    """Yield each date from start_date (inclusive) up to end_date (exclusive).

    Yields nothing when end_date is not after start_date.
    (Idea from http://stackoverflow.com/questions/1060279)
    """
    current = start_date
    while current < end_date:
        yield current
        current += timedelta(days=1)
def add_posts_to_dict(post_list):
    """Merge a batch of submissions into `d`, skipping ids already recorded.

    Each new entry starts with an empty comment map and is marked incomplete.
    """
    for submission in post_list:
        if submission.id in d:
            continue  # duplicate from an overlapping search -- keep the first copy
        d[submission.id] = {"object": submission, "comments": {}, "complete": False}
# Identify the script to reddit's API, crediting the original author.
useragent = "Top comment finder script, by /u/pcjonathan, modified from /u/pokechu22: https://redd.it/3z31a6"
# Build the PRAW session, optionally routed through a praw-multiprocess handler.
if MULTIPROCESS:
    from praw.handlers import MultiprocessHandler
    r = praw.Reddit(useragent, handler=MultiprocessHandler())
else:
    r = praw.Reddit(useragent)
if OAUTH:
    # OAuth2Util reads oauth.ini from the working directory.
    import OAuth2Util
    o = OAuth2Util.OAuth2Util(r)
    o.refresh(force=True)
    # OAuth permits a faster request rate than the default API delay.
    r.config.api_request_delay = 1.0
elif USERNAME is not None and PASSWORD is not None:
    # Fall back to cookie-based login when credentials are configured.
    r.login(USERNAME,PASSWORD)
    print("Logged in ", r.get_me().name)
# Cloudsearch template for an epoch-seconds timestamp range.
query = "timestamp:{}..{}"
def get_posts(query, sort="top"):
    """Search SUBREDDIT with a cloudsearch query and record the results in `d`."""
    found = r.search(query, subreddit=SUBREDDIT, sort=sort,
                     syntax="cloudsearch", limit=NUM_SEARCH_RESULTS)
    add_posts_to_dict(list(found))
def get_posts_for_date(single_date):
    """Collect every submission posted during one calendar day (detail mode)."""
    global from_timestamp, to_timestamp, query
    # Convert the day's start and end (local time) to epoch seconds for cloudsearch.
    from_timestamp = time.mktime(single_date.timetuple())
    to_timestamp = time.mktime((single_date + timedelta(days=1)).timetuple())
    day_query = query.format(int(from_timestamp), int(to_timestamp))
    get_posts(day_query)
    if SUBREDDIT == "All":
        # For /r/All, also grab the most-commented posts in the same window.
        get_posts(day_query, sort="comments")
# Submission-gathering phase. Skipped entirely when d was loaded from a
# resume file (len(d) > 0), since the submission list is saved after this step.
if DETAIL_MODE and len(d) == 0:
    # Detail mode: one search per day in the window, to get past reddit's
    # ~1000-result search cap on busy subreddits.
    print("In Detail Mode\n")
    if progress:
        for single_date in tqdm(daterange(FROM, TO),desc="Grabbing each date's posts", total=(TO - FROM).days, unit="Days"):
            get_posts_for_date(single_date)
    else:
        for single_date in daterange(FROM, TO):
            get_posts_for_date(single_date)
            print("Completed Date: ",single_date)
elif len(d) == 0:
    # Normal mode: a single search over the whole window, relying on
    # reddit's "top" and "comments" sorts to surface the best candidates.
    print("NOT In Detail Mode")
    # http://stackoverflow.com/q/9637838/3991344
    from_timestamp = time.mktime(FROM.timetuple())
    to_timestamp = time.mktime(TO.timetuple())
    q = query.format(int(from_timestamp), int(to_timestamp))
    get_posts(q)
    get_posts(q,sort="comments")
# Checkpoint the gathered submissions before the (long) comment phase.
save_d()
print("Analysing comments on {} posts...\n".format(len(d)))
all_comments = []  # flat list of every comment gathered, across all submissions
GRABBED = 0        # submissions fetched this session (resumed ones don't count)
def get_comments(s,sub):
    """Fetch a submission's comments into sub["comments"], then pool them.

    s is the submission id; sub is its entry in `d`. Submissions already
    marked complete (from a resume file) skip the fetch but still contribute
    their saved comments to all_comments.
    """
    global GRABBED
    if not sub["complete"]:
        submission = r.get_submission(submission_id=s, comment_limit=NUM_COMMENTS_PER_SUBMISSION, comment_sort="TOP")
        if DETAIL_MODE:
            submission.replace_more_comments(limit=None) # Get all comments
        else:
            submission.replace_more_comments(limit=0) # Get rid of morecomments
        coms = praw.helpers.flatten_tree(submission.comments)
        for com in coms:
            sub["comments"][com.id] = com
        sub["complete"] = True
        GRABBED += 1
        # Checkpoint periodically so an interrupted run can resume.
        if (GRABBED % 20) == 0:
            save_d()
    all_comments.extend(sub["comments"].values())
# Comment-gathering phase: visit every recorded submission, with a tqdm
# progress bar when available, otherwise per-submission prints.
if progress:
    for s, sub in tqdm(d.items(),desc="Grabbing Comments from Posts", unit=" Posts"):
        get_comments(s,sub)
else:
    count = 0
    for s, sub in d.items():
        count += 1
        get_comments(s,sub)
        print("Completed submission {} of {}".format(count,len(d)))
print("Grabbed {} Submissions this session".format(GRABBED))
# Final checkpoint, only when something new was fetched this session.
if GRABBED > 0:
    save_d()
print("Retrieved {} comments in {} posts".format(len(all_comments),len(d)))
# Rank every pooled comment by score, highest first (sorted() is stable,
# so ties keep their encounter order), then keep the top slice.
top_comments = sorted(all_comments, key=lambda c: c.score, reverse=True)[:NUM_TOP_COMMENTS]
# Assemble a markdown table, one row per comment; deleted authors and
# vanished submissions get placeholder labels.
rows = ["Score|User|Post Title|Link\n-----|------|----------|---------------\n"]
for c in top_comments:
    author = ("/u/" + c.author.name) if c.author else "[DELETED]"
    title = c.submission.title if c.submission else "[DELETED (!?)]"
    rows.append("%s|%s|%s|[link](%s)\n" % (c.score, author, title, c.permalink))
table = "".join(rows)
# Write the finished table next to the resume file, named after the subreddit.
with open(SUBREDDIT + ".txt","w") as f:
    f.write(table)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment