Skip to content

Instantly share code, notes, and snippets.

@ameerkat
Created October 29, 2020 21:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ameerkat/ccbc9077b2bae8ff8f9d77790d74a149 to your computer and use it in GitHub Desktop.
# Adapted from https://www.osrsbox.com/blog/2019/03/18/watercooler-scraping-an-entire-subreddit-2007scape/
import requests
import json
import re
import time
import os
# Name of the subreddit to scrape.
SUBREDDIT = "movies"
# Base URL of the Pushshift Reddit archive API.
PUSHSHIFT_REDDIT_URL = "http://api.pushshift.io/reddit"
# Output directory for the scraped data; created below before scraping starts.
DATA_FOLDER = "./r-" + SUBREDDIT
AFTER_EPOCH = 1546300800 # 01/01/2019 @ 12:00am (UTC), see https://www.epochconverter.com/
def fetchObjects(**kwargs):
    """Fetch one page of objects from the Pushshift API.

    Keyword arguments are forwarded as query parameters (e.g. subreddit,
    after). The special keyword ``type`` selects the endpoint: "submission"
    or "comment" (default "comment").

    Returns the page's records sorted ascending by their base-36 numeric id.
    Returns an empty list on a non-200 response — the original returned
    None here, which crashed the caller's for-loop with a TypeError.
    """
    # Default parameters for the API query; caller kwargs override them.
    params = {
        "sort_type": "created_utc",
        "sort": "asc",
        "size": 1000,
    }
    params.update(kwargs)
    # Endpoint selection: "comment" unless type="submission" was requested.
    object_type = "comment"
    if 'type' in kwargs and kwargs['type'].lower() == "submission":
        object_type = "submission"
    # Perform the API request (30s timeout so a hung server can't stall us).
    r = requests.get(PUSHSHIFT_REDDIT_URL + "/" + object_type + "/search/",
                     params=params, timeout=30)
    if r.status_code != 200:
        # Best-effort: skip this page rather than crash the scrape loop.
        return []
    data = json.loads(r.text)['data']
    # Sort by numeric id so the caller's max-id bookkeeping is monotonic.
    return sorted(data, key=lambda x: int(x['id'], 36))
def extract_reddit_data(**kwargs):
    """Page through the Pushshift API and append every object to a JSONL file.

    Keyword arguments (e.g. subreddit, type) are forwarded to fetchObjects.
    Records are written one JSON document per line to
    DATA_FOLDER/submissions.json, starting from AFTER_EPOCH and advancing
    the `after` cursor until a page yields nothing new.
    """
    # Start the time cursor at the configured epoch.
    max_created_utc = AFTER_EPOCH
    max_id = 0
    # Context manager guarantees the file is closed even on errors
    # (the original opened the handle and never closed it).
    with open(os.path.join(DATA_FOLDER, "submissions.json"), "a") as out_file:
        while True:
            nothing_processed = True
            # Fetch the next page after the current time cursor.
            objects = fetchObjects(**kwargs, after=max_created_utc)
            # Objects arrive sorted by id; only ids we haven't seen count.
            for obj in objects:
                obj_id = int(obj['id'], 36)
                if obj_id > max_id:
                    nothing_processed = False
                    created_utc = obj['created_utc']
                    max_id = obj_id
                    if created_utc > max_created_utc:
                        max_created_utc = created_utc
                    # One JSON document per line (JSONL).
                    print(json.dumps(obj, sort_keys=True, ensure_ascii=True),
                          file=out_file)
            # Exit once a page contributes no new records.
            if nothing_processed:
                return
            # Step the cursor back one second so records sharing the last
            # timestamp are not skipped (duplicates are filtered by max_id).
            max_created_utc -= 1
            # Be polite to the API between pages.
            time.sleep(.5)
# Ensure the output directory exists, then scrape all submissions from the
# configured subreddit since AFTER_EPOCH (runs at import time — script style).
os.makedirs(DATA_FOLDER, exist_ok=True)
extract_reddit_data(subreddit=SUBREDDIT,type="submission")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment