Skip to content

Instantly share code, notes, and snippets.

@JianLoong
Created November 7, 2022 22:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JianLoong/e8a92c7352e3b3276e17a060231e4432 to your computer and use it in GitHub Desktop.
Save JianLoong/e8a92c7352e3b3276e17a060231e4432 to your computer and use it in GitHub Desktop.
A simple Reddit crawler that obtains post titles and their top-level comments.
import json
from concurrent.futures import as_completed
import requests
from requests_futures.sessions import FuturesSession
REDDIT_URL: str = "https://www.reddit.com/r/programming.json?limit=1"
def process():
    """Crawl the newest r/programming post(s) and print each post's title
    together with its top-level comments as indented JSON.

    Returns:
        list[dict]: one dict per submission with keys "title" and "reply"
        (the list of top-level comment bodies). Also printed to stdout.
    """
    # Browser-like headers so Reddit does not reject the request as a bot.
    header = {
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-User": "?1",
        "Sec-Fetch-Dest": "document",
        "Referer": "https://www.google.com/",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
    }

    session = requests.Session()
    session.headers = header
    try:
        response = session.get(REDDIT_URL)
        response.raise_for_status()
    # Catch only request-related failures instead of a bare `except:`,
    # which also swallowed SystemExit/KeyboardInterrupt and hid the cause.
    except requests.RequestException as exc:
        print(f"Request to Reddit failed: {exc}")
        exit(1)

    # The listing payload nests the posts under data -> children.
    posts = response.json()['data']['children']

    results = []
    urls = []
    for index, post in enumerate(posts, start=1):
        print(f"{index} out of {len(posts)}")
        data = post['data']
        result = {
            "title": data['title'],
            "permalink": data['permalink'],
            "name": data['name'],
            "created": data['created_utc'],
            "selftext": data['selftext'],
        }
        # Bug fix: the original built `result` each iteration but never
        # stored it, leaving `results` permanently empty.
        results.append(result)
        urls.append("https://www.reddit.com" + data['permalink'] + ".json")

    submissions = []
    # Fetch all comment pages concurrently (up to 30 in flight).
    with FuturesSession(max_workers=30) as futures_session:
        futures_session.headers = header
        futures = [futures_session.get(url) for url in urls]
        for future in as_completed(futures):
            # Parse the body once instead of calling .json() per access.
            payload = future.result().json()
            # payload[0] is the submission itself, payload[1] its comments.
            title = payload[0]["data"]["children"][0]["data"]["title"]
            print(title)
            replies = []
            for reply in payload[1]['data']['children']:
                # "more" stub children carry no 'body' key; skip them
                # instead of raising KeyError.
                body = reply['data'].get('body')
                if body is not None:
                    replies.append(body)
            submissions.append({
                "title": title,
                "reply": replies,
            })

    print(json.dumps(submissions, indent=4))
    return submissions
# Script entry point: run the crawler, then exit with a success status.
if __name__ == "__main__":
    process()
    raise SystemExit(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment