Skip to content

Instantly share code, notes, and snippets.

@duhaime
Created May 27, 2020 14:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save duhaime/102aacbacb0b04a30547e4bdc88a0048 to your computer and use it in GitHub Desktop.
Save duhaime/102aacbacb0b04a30547e4bdc88a0048 to your computer and use it in GitHub Desktop.
Fetch Reddit Pushshift
from dateutil.relativedelta import relativedelta
from datetime import date, datetime
import json
import requests
import calendar
import time
import sys
# Download Pushshift submission-search results for one subreddit over the
# three years ending today. The time range is cut into `n` equal slices; each
# slice is fetched with its own request and saved as '<subreddit>-<i>.json'
# in the current working directory.
end = date.today()
start = end - relativedelta(years=3)
# convert to utc seconds (midnight of each date, interpreted as UTC by timegm)
end = calendar.timegm(end.timetuple())
start = calendar.timegm(start.timetuple())
n = 500
# Width of one slice in seconds. Floor division means up to n-1 seconds at
# the very end of the range are not covered by any slice.
delta = (end - start) // n
subreddit = 'amitheasshole'
for i in range(n):
    # The URL depends only on the slice index, so build it once per slice
    # rather than on every retry.
    url = 'https://api.pushshift.io/reddit/submission/search/?after={}&before={}&sort_type=score&sort=desc&subreddit={}'.format(
        (i * delta) + start,
        ((i + 1) * delta) + start,
        subreddit,
    )
    while True:  # handle 502
        r = requests.get(url)
        try:
            j = r.json()
        except ValueError:
            # An error response such as a 502 may carry a non-JSON body.
            # Every JSON decode error raised by requests / json / simplejson
            # is a ValueError subclass; treat it as "no payload" and retry
            # instead of crashing the whole download.
            j = None
        print(i, n, url)
        # Fixed pause after every request, successful or not, to stay
        # polite to the API.
        time.sleep(2)
        # Only a JSON object carrying a 'data' key counts as a result.
        if isinstance(j, dict) and 'data' in j:
            # Persist only valid payloads; failed attempts are retried and
            # never leave a partial or error file behind.
            with open('{}-{}.json'.format(subreddit, i), 'w') as out:
                json.dump(j, out)
            print(len(j['data']))
            break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment