Created
April 8, 2021 17:55
-
-
Save eric-chapdelaine/7d2e69bda848c0246d45d8b5b7cb1ba6 to your computer and use it in GitHub Desktop.
Reddit Scraper to Get All Text Between Dates
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import praw | |
from psaw import PushshiftAPI | |
import datetime as dt | |
# Collect the self-text of every submission made to one subreddit between two
# dates and append it all, unseparated, to "<subreddit>.txt".
#
# The date range is walked in equal-width windows; each window is one
# Pushshift query capped at `posts_per_call` results.

api = PushshiftAPI()

# Upper bound on submissions requested per query window; also the number of
# windows the date range is divided into.
posts_per_call = 100

# class of 2020: Aug 2019 to June 2020
# class of 2021: Aug 2020 to June 2021
start_epoch = int(dt.datetime(2019, 8, 1).timestamp())
end_epoch = int(dt.datetime(2020, 6, 1).timestamp())

# Width of one query window in seconds. Derived from the epochs above rather
# than hard-coded, so switching to another date range cannot leave a stale
# constant behind (the range above is 305 days = 26352000 seconds).
seconds_between_calls = (end_epoch - start_epoch) / posts_per_call

sub = 'sat'

# Placeholder bodies Reddit substitutes for deleted/removed posts.
skipped_bodies = ("[deleted]", "[removed]")

# `with` guarantees the file is closed even if an API call raises part-way
# through. UTF-8 is explicit because post bodies routinely contain characters
# the platform default encoding may not be able to represent.
with open(sub + ".txt", "w", encoding="utf-8") as f:
    print("Started work on " + sub)

    window_start = start_epoch
    while window_start < end_epoch:
        # Clamp so the final window never reaches past the requested range.
        window_end = min(window_start + seconds_between_calls, end_epoch)
        results = api.search_submissions(after=int(window_start),
                                         before=int(window_end),
                                         subreddit=sub,
                                         filter=['selftext'],
                                         limit=posts_per_call)
        window_start += seconds_between_calls

        for submission in results:
            # NOTE(review): `d_` is presumably psaw's dict of the raw fields
            # returned for the submission - confirm against the psaw version
            # in use. Some results may carry no usable 'selftext'; those are
            # reported and skipped instead of being hidden by a catch-all
            # `except`.
            body = submission.d_.get('selftext')
            if not isinstance(body, str):
                print("Error: moving on")
                continue
            if body not in skipped_bodies:
                f.write(body)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment