Last active
January 22, 2018 20:17
-
-
Save comma3/7f0489ae79907a09115762b894911ba5 to your computer and use it in GitHub Desktop.
Using praw (Python Reddit API Wrapper) to collect more than 1000 results from a query
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Reddit's API limits search results to 1000 for any query. If
# your query requires searching for more than 1000 results,
# you need a way to further limit the search space to find
# all of your posts. PRAW does not have an obvious solution,
# and I found few resources describing possible solutions.
# Some of the solutions on the internet are for old versions
# of PRAW and no longer work.
# The code below provides a solution by making a new query for each day.
# The interval can easily be shortened if there may be more than
# 1000 relevant posts in a single day. An exception is raised if
# the script believes a particular query hit the 1000-result limit.
# This specific implementation works with seasonal data, but it can readily
# be changed to work over continuous time periods.
import sys, re, time | |
from datetime import datetime, timedelta | |
from dateutil import parser | |
import praw | |
def generate_dates(season_start, season_end, year, interval=86400):
    """
    Yield UTC timestamps for every query window between season_start and
    season_end.

    Could be modified to skip days or to collect actual dates that have games
    from a DB or the web. The exhaustive search is kept for now as it only
    needs to be completed once and is relatively fast.

    INPUT:
        season_start: String of date in MM/DD for the first day of the season.
        season_end: String of date in MM/DD for the last day of the season.
        year: int indicating the year of the start day. Dates may span years:
              when the end month is numerically earlier than the start
              month, the season is taken to end in year + 1.
        interval: Int indicating the width of query window. Defaults to 1 day
                  but if there are too many posts per day a smaller interval
                  may be necessary. Units = seconds. Must be positive.

    OUTPUT:
        Generator of UTC timestamps (floats, in seconds). With the default
        interval the first value is 08:00 UTC on season_start and the values
        then advance by `interval`. 08:00 UTC is a fixed offset, so the
        matching US Eastern wall-clock time shifts by an hour when DST
        changes midseason.

    RAISES:
        ValueError: if interval is not positive (the loop would never end).
    """
    if interval <= 0:
        raise ValueError("interval must be a positive number of seconds")

    # Compare the months as integers. Comparing the raw strings is wrong for
    # two-digit months ('10' < '8' lexicographically), which would push an
    # Aug-Oct season into the following year.
    start_month = int(season_start.split('/')[0])
    end_month = int(season_end.split('/')[0])
    end_year = year + 1 if end_month < start_month else year

    season_start += '/' + str(year)
    season_end += '/' + str(end_year)

    # Naive epoch; equal to the deprecated datetime.utcfromtimestamp(0).
    epoch = datetime(1970, 1, 1)

    # The loop advances by `interval` before yielding, so start 16 hours
    # (57600 s) before midnight: with the default one-day interval the first
    # value lands on 08:00 UTC of the first day.
    next_date = (parser.parse(season_start) - epoch).total_seconds() - 57600
    # One extra day (86400 s) so the final day of the season is included.
    stop_date = (parser.parse(season_end) - epoch).total_seconds() + 86400

    while next_date < stop_date:
        next_date = next_date + interval
        yield next_date
def get_submissions(praw_instance, start_time, stop_time=None, query='', subreddit='cfb'):
    """
    Collect the game and post-game threads found in one time window.

    Gets up to the limit of subreddit submissions (1000) by date. Defaults to
    searching a single day, so usage is suggested to pass a utc timestamp for
    midnight (or whatever low activity point you desire).

    INPUT:
        praw_instance = a properly initialized praw instance
        start_time = int or float utc timestamp where the search should begin
        stop_time = utc timestamp indicating when to stop. Defaults to
                    start_time + 1 day when left as None.
        query = string for reddit search (reddit search is very unreliable,
                so we leave it empty and check the titles ourselves).
        subreddit = string representing subreddit to search (easy
                    modification for other sports).

    OUTPUT:
        List of the thread objects whose lower-cased title contains
        '[game thread]', '[post game thread]' or '[postgame thread]'.

    RAISES:
        ValueError: if 1000 or more threads were returned for the window,
                    since results may then have been truncated.
    """
    # Test against None explicitly so that a caller-supplied timestamp of 0
    # is honoured instead of being replaced by the default window.
    if stop_time is None:
        # Making an assumption that there won't be 1000 posts in a single day
        # Reddit search limits results to 1000 for anything
        stop_time = start_time + 86400  # add a day

    title_tags = ('[post game thread]', '[postgame thread]', '[game thread]')
    game_threads = []
    total = 0  # stays 0 when the search returns nothing

    results = praw_instance.subreddit(subreddit).submissions(start_time, stop_time, query)
    for total, thread in enumerate(results, start=1):
        title = thread.title.lower()
        if any(tag in title for tag in title_tags):
            game_threads.append(thread)

    print('Total threads on this date: ', total)
    if total > 999:
        # Want to raise an exception here so we don't miss any data
        raise ValueError("Exceeded search limits!")
    return game_threads
if __name__ == '__main__':
    # Search configuration.
    subreddit = 'cfb'
    season_start = '8/15'  # 'Month/Day'; the '/' separator is required
    season_end = '1/20'
    first_year = 2014  # Parser should accept both 20XX and XX
    last_year = 2017

    # Bot credentials are looked up in praw.ini under this name.
    bot_params = 'bot1'
    reddit = praw.Reddit(bot_params)

    # Walk each season in turn, one query window per generated timestamp.
    for year in range(first_year, last_year + 1):
        banner = '==========START {}============='.format(year)
        print(banner)

        game_threads = []
        for date in generate_dates(season_start, season_end, year):
            # `date` is a UTC timestamp; show it in human-readable form.
            readable = datetime.fromtimestamp(date).strftime('%Y-%m-%d %H:%M:%S')
            print(readable)

            # Performs the task at hand.
            game_threads += get_submissions(reddit, date)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment