Skip to content

Instantly share code, notes, and snippets.

@ameerkat
Created October 29, 2020 21:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ameerkat/ccbc9077b2bae8ff8f9d77790d74a149 to your computer and use it in GitHub Desktop.
# Adapted from https://www.osrsbox.com/blog/2019/03/18/watercooler-scraping-an-entire-subreddit-2007scape/
import requests
import json
import re
import time
import os
# Name of the subreddit to scrape.
SUBREDDIT = "movies"
# Base URL of the Pushshift Reddit archive API.
PUSHSHIFT_REDDIT_URL = "http://api.pushshift.io/reddit"
# Output directory for the scraped data; created below before scraping starts.
DATA_FOLDER = "./r-" + SUBREDDIT
AFTER_EPOCH = 1546300800 # 01/01/2019 @ 12:00am (UTC), see https://www.epochconverter.com/
def fetchObjects(**kwargs):
    """Fetch one page of objects from the Pushshift API.

    Keyword arguments are forwarded as query parameters (e.g. subreddit,
    after). The special keyword ``type`` selects the endpoint: "submission"
    or "comment" (default "comment").

    Returns the page's records sorted ascending by their base-36 numeric id.
    Returns an empty list on a non-200 response — the original returned
    None here, which crashed the caller's for-loop with a TypeError.
    """
    # Default parameters for the API query; caller kwargs override them.
    params = {
        "sort_type": "created_utc",
        "sort": "asc",
        "size": 1000,
    }
    params.update(kwargs)
    # Endpoint selection: "comment" unless type="submission" was requested.
    object_type = "comment"
    if 'type' in kwargs and kwargs['type'].lower() == "submission":
        object_type = "submission"
    # Perform the API request (30s timeout so a hung server can't stall us).
    r = requests.get(PUSHSHIFT_REDDIT_URL + "/" + object_type + "/search/",
                     params=params, timeout=30)
    if r.status_code != 200:
        # Best-effort: skip this page rather than crash the scrape loop.
        return []
    data = json.loads(r.text)['data']
    # Sort by numeric id so the caller's max-id bookkeeping is monotonic.
    return sorted(data, key=lambda x: int(x['id'], 36))
def extract_reddit_data(**kwargs):
    """Page through the Pushshift API and append every object to a JSONL file.

    Keyword arguments (e.g. subreddit, type) are forwarded to fetchObjects.
    Records are written one JSON document per line to
    DATA_FOLDER/submissions.json, starting from AFTER_EPOCH and advancing
    the `after` cursor until a page yields nothing new.
    """
    # Start the time cursor at the configured epoch.
    max_created_utc = AFTER_EPOCH
    max_id = 0
    # Context manager guarantees the file is closed even on errors
    # (the original opened the handle and never closed it).
    with open(os.path.join(DATA_FOLDER, "submissions.json"), "a") as out_file:
        while True:
            nothing_processed = True
            # Fetch the next page after the current time cursor.
            objects = fetchObjects(**kwargs, after=max_created_utc)
            # Objects arrive sorted by id; only ids we haven't seen count.
            for obj in objects:
                obj_id = int(obj['id'], 36)
                if obj_id > max_id:
                    nothing_processed = False
                    created_utc = obj['created_utc']
                    max_id = obj_id
                    if created_utc > max_created_utc:
                        max_created_utc = created_utc
                    # One JSON document per line (JSONL).
                    print(json.dumps(obj, sort_keys=True, ensure_ascii=True),
                          file=out_file)
            # Exit once a page contributes no new records.
            if nothing_processed:
                return
            # Step the cursor back one second so records sharing the last
            # timestamp are not skipped (duplicates are filtered by max_id).
            max_created_utc -= 1
            # Be polite to the API between pages.
            time.sleep(.5)
# Ensure the output directory exists, then scrape all submissions from the
# configured subreddit since AFTER_EPOCH (runs at import time — script style).
os.makedirs(DATA_FOLDER, exist_ok=True)
extract_reddit_data(subreddit=SUBREDDIT,type="submission")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment