Last active
July 31, 2016 19:34
-
-
Save pcjonathan/c16065ce38dbe4833044 to your computer and use it in GitHub Desktop.
Python 3 script for finding the top comments in one or more subreddits. Supports OAuth and resumption.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Searches for top comments
# By /u/PCJonathan
# Adapted and enhanced from the original at https://redd.it/3z31a6 (by /u/Pokechu22)
# This has only been tested using my configuration: OAuth, detail mode, with multiprocess on, so far.
# There shouldn't be any problems; however, there's no guarantee.
# If there are any problems, send me a message and I'll look into it.
# Progress is reported. Install the tqdm module for a decent progress bar and ETA support.
# Use of OAuth is highly recommended, as it speeds up the process dramatically. Instructions here:
# https://github.com/SmBe19/praw-OAuth2Util/blob/master/OAuth2Util/README.md
# This script also writes all the data it has gathered to disk after collecting submissions and after each submission's comments.
# Be aware that the file can get very large, and constant rewrites are not fantastic for SSDs.
# You can turn this off using the RESUME variable.
import praw, time, os.path, pickle | |
from datetime import datetime, date, timedelta | |
### IF YOU CHANGE ANY OF THESE SETTINGS, DELETE DICT.DAT TO RESET STATUS.
# (Although some edits -- e.g. adding a subreddit or extending the date
#  range -- may work without a reset; when in doubt, start fresh.)
SUBREDDIT = "Subreddit" #Multis are also supported, EG "modtalk+modsupport"
FROM = date(2015,1,1) #Year,Month,Day -- start of the search window (inclusive)
TO = date(2016,1,1)   # end of the search window (exclusive; see daterange below)
#Detail mode attempts to check everything. It is intended for very very busy subs and can take an age.
#It took an estimated 4.5-5 hours to scan /r/DoctorWho for 2015 (7340 posts) using OAuth.
# Disable to rely on reddit's top sorting instead
DETAIL_MODE = False
#Praw connection settings
MULTIPROCESS = False #If using praw-multiprocess, set to true
OAUTH = False #If you're using oauth, set to true. Requires oauth.ini in same folder.
USERNAME = None #If logging in via cookie, enter username and password strings here
PASSWORD = None # e.g. "pcjonathan"
NUM_TOP_COMMENTS = 100 # How many comments to give results for
# How many comments to get per submission. The higher the number, the longer it may take.
# Max 500 (1500 if oauthed with a gold account or a moderator)
NUM_COMMENTS_PER_SUBMISSION = 500
# How many submissions to get for each search result.
# Max is about 1000; set to None to get the real max
NUM_SEARCH_RESULTS = None
# Turn resume saving on and off. Does not affect checking for an already existing file
RESUME = True
##### END CONFIG
filename = SUBREDDIT + ".dat"  # pickle file used for saving/resuming progress
def save_d():
    """Persist the accumulated data dict `d` to the resume file.

    Does nothing when RESUME is disabled.
    """
    if not RESUME:
        return
    with open(filename, 'wb') as out:
        pickle.dump(d, out)
# Optional dependency: tqdm provides the progress bars/ETA used below.
# Fall back to plain print-based progress when it is not installed.
try:
    from tqdm import *
    progress = True
except ImportError:
    progress = False
# Resume support: reload previously collected data if a save file exists,
# otherwise start with an empty collection dict.
# d maps submission id -> {"object": submission, "comments": {id: comment}, "complete": bool}
if os.path.isfile(filename):
    with open(filename,"rb") as f:
        d = pickle.load(f)
else:
    d = {}
#http://stackoverflow.com/questions/1060279/iterating-through-a-range-of-dates-in-python | |
def daterange(start_date, end_date):
    """Yield each date from start_date (inclusive) up to end_date (exclusive).

    Yields nothing when end_date is not after start_date.
    (Idea from http://stackoverflow.com/questions/1060279)
    """
    current = start_date
    while current < end_date:
        yield current
        current += timedelta(days=1)
def add_posts_to_dict(post_list):
    """Merge a batch of submissions into `d`, skipping ids already recorded.

    Each new entry starts with an empty comment map and is marked incomplete.
    """
    for submission in post_list:
        if submission.id in d:
            continue  # duplicate from an overlapping search -- keep the first copy
        d[submission.id] = {"object": submission, "comments": {}, "complete": False}
# Identify the script to reddit's API, crediting the original author.
useragent = "Top comment finder script, by /u/pcjonathan, modified from /u/pokechu22: https://redd.it/3z31a6"
# Build the PRAW session, optionally routed through a praw-multiprocess handler.
if MULTIPROCESS:
    from praw.handlers import MultiprocessHandler
    r = praw.Reddit(useragent, handler=MultiprocessHandler())
else:
    r = praw.Reddit(useragent)
if OAUTH:
    # OAuth2Util reads oauth.ini from the working directory.
    import OAuth2Util
    o = OAuth2Util.OAuth2Util(r)
    o.refresh(force=True)
    # OAuth permits a faster request rate than the default API delay.
    r.config.api_request_delay = 1.0
elif USERNAME is not None and PASSWORD is not None:
    # Fall back to cookie-based login when credentials are configured.
    r.login(USERNAME,PASSWORD)
    print("Logged in ", r.get_me().name)
# Cloudsearch template for an epoch-seconds timestamp range.
query = "timestamp:{}..{}"
def get_posts(query, sort="top"):
    """Search SUBREDDIT with a cloudsearch query and record the results in `d`."""
    found = r.search(query, subreddit=SUBREDDIT, sort=sort,
                     syntax="cloudsearch", limit=NUM_SEARCH_RESULTS)
    add_posts_to_dict(list(found))
def get_posts_for_date(single_date):
    """Collect every submission posted during one calendar day (detail mode)."""
    global from_timestamp, to_timestamp, query
    # Convert the day's start and end (local time) to epoch seconds for cloudsearch.
    from_timestamp = time.mktime(single_date.timetuple())
    to_timestamp = time.mktime((single_date + timedelta(days=1)).timetuple())
    day_query = query.format(int(from_timestamp), int(to_timestamp))
    get_posts(day_query)
    if SUBREDDIT == "All":
        # For /r/All, also grab the most-commented posts in the same window.
        get_posts(day_query, sort="comments")
# Submission-gathering phase. Skipped entirely when d was loaded from a
# resume file (len(d) > 0), since the submission list is saved after this step.
if DETAIL_MODE and len(d) == 0:
    # Detail mode: one search per day in the window, to get past reddit's
    # ~1000-result search cap on busy subreddits.
    print("In Detail Mode\n")
    if progress:
        for single_date in tqdm(daterange(FROM, TO),desc="Grabbing each date's posts", total=(TO - FROM).days, unit="Days"):
            get_posts_for_date(single_date)
    else:
        for single_date in daterange(FROM, TO):
            get_posts_for_date(single_date)
            print("Completed Date: ",single_date)
elif len(d) == 0:
    # Normal mode: a single search over the whole window, relying on
    # reddit's "top" and "comments" sorts to surface the best candidates.
    print("NOT In Detail Mode")
    # http://stackoverflow.com/q/9637838/3991344
    from_timestamp = time.mktime(FROM.timetuple())
    to_timestamp = time.mktime(TO.timetuple())
    q = query.format(int(from_timestamp), int(to_timestamp))
    get_posts(q)
    get_posts(q,sort="comments")
# Checkpoint the gathered submissions before the (long) comment phase.
save_d()
print("Analysing comments on {} posts...\n".format(len(d)))
all_comments = []  # flat list of every comment gathered, across all submissions
GRABBED = 0        # submissions fetched this session (resumed ones don't count)
def get_comments(s,sub):
    """Fetch a submission's comments into sub["comments"], then pool them.

    s is the submission id; sub is its entry in `d`. Submissions already
    marked complete (from a resume file) skip the fetch but still contribute
    their saved comments to all_comments.
    """
    global GRABBED
    if not sub["complete"]:
        submission = r.get_submission(submission_id=s, comment_limit=NUM_COMMENTS_PER_SUBMISSION, comment_sort="TOP")
        if DETAIL_MODE:
            submission.replace_more_comments(limit=None) # Get all comments
        else:
            submission.replace_more_comments(limit=0) # Get rid of morecomments
        coms = praw.helpers.flatten_tree(submission.comments)
        for com in coms:
            sub["comments"][com.id] = com
        sub["complete"] = True
        GRABBED += 1
        # Checkpoint periodically so an interrupted run can resume.
        if (GRABBED % 20) == 0:
            save_d()
    all_comments.extend(sub["comments"].values())
# Comment-gathering phase: visit every recorded submission, with a tqdm
# progress bar when available, otherwise per-submission prints.
if progress:
    for s, sub in tqdm(d.items(),desc="Grabbing Comments from Posts", unit=" Posts"):
        get_comments(s,sub)
else:
    count = 0
    for s, sub in d.items():
        count += 1
        get_comments(s,sub)
        print("Completed submission {} of {}".format(count,len(d)))
print("Grabbed {} Submissions this session".format(GRABBED))
# Final checkpoint, only when something new was fetched this session.
if GRABBED > 0:
    save_d()
print("Retrieved {} comments in {} posts".format(len(all_comments),len(d)))
# Rank every pooled comment by score, highest first (sorted() is stable,
# so ties keep their encounter order), then keep the top slice.
top_comments = sorted(all_comments, key=lambda c: c.score, reverse=True)[:NUM_TOP_COMMENTS]
# Assemble a markdown table, one row per comment; deleted authors and
# vanished submissions get placeholder labels.
rows = ["Score|User|Post Title|Link\n-----|------|----------|---------------\n"]
for c in top_comments:
    author = ("/u/" + c.author.name) if c.author else "[DELETED]"
    title = c.submission.title if c.submission else "[DELETED (!?)]"
    rows.append("%s|%s|%s|[link](%s)\n" % (c.score, author, title, c.permalink))
table = "".join(rows)
# Write the finished table next to the resume file, named after the subreddit.
with open(SUBREDDIT + ".txt","w") as f:
    f.write(table)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment