Created
November 7, 2022 22:02
-
-
Save JianLoong/e8a92c7352e3b3276e17a060231e4432 to your computer and use it in GitHub Desktop.
A simple Reddit crawler to obtain post titles and top-level comments.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from concurrent.futures import as_completed | |
import requests | |
from requests_futures.sessions import FuturesSession | |
REDDIT_URL: str = "https://www.reddit.com/r/programming.json?limit=1" | |
def process():
    """Fetch the newest r/programming listing and print each post's title
    together with its top-level comment bodies as pretty-printed JSON.

    Performs one blocking request for the listing, then fetches every
    post's comment page concurrently via a FuturesSession.

    Raises:
        SystemExit: with status 1 if the initial listing request fails.
    """
    # Browser-like headers so Reddit serves the JSON instead of blocking
    # the default requests user agent.
    header = {
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-User": "?1",
        "Sec-Fetch-Dest": "document",
        "Referer": "https://www.google.com/",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9"
    }

    session = requests.Session()
    session.headers = header

    try:
        response = session.get(REDDIT_URL)
    except requests.RequestException:
        # Only network/HTTP transport errors are expected here; anything
        # else (e.g. a typo-level bug) should surface as a traceback.
        raise SystemExit(1)

    # The listing JSON nests posts under data -> children.
    posts = response.json()['data']['children']

    # Build one comment-page URL per post; appending ".json" to a post's
    # permalink yields its comment tree as JSON.
    urls = []
    for index, post in enumerate(posts, start=1):
        print(f"{index} out of {len(posts)}")
        permalink = post['data']['permalink']
        urls.append("https://www.reddit.com" + permalink + ".json")

    submissions = []
    with FuturesSession(max_workers=30) as futures_session:
        futures_session.headers = header
        futures = [futures_session.get(url) for url in urls]
        for future in as_completed(futures):
            # Parse the body once; the payload is a two-element array:
            # [0] the post itself, [1] its top-level comment listing.
            payload = future.result().json()
            title = payload[0]["data"]["children"][0]["data"]["title"]
            print(title)

            replies = []
            for reply in payload[1]['data']['children']:
                # "more" stub entries have no 'body' key; skip them
                # instead of raising KeyError.
                body = reply['data'].get('body')
                if body is not None:
                    replies.append(body)

            submissions.append({
                "title": title,
                "reply": replies
            })

    print(json.dumps(submissions, indent=4))
# Script entry point: run the crawler, then exit with an explicit success
# status. `raise SystemExit(0)` is used instead of the `exit()` builtin,
# which is a site-module convenience not guaranteed to exist in every
# interpreter environment.
if __name__ == "__main__":
    process()
    raise SystemExit(0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment