Last active
July 16, 2023 15:07
-
-
Save gavin19/8e2ed7547efcbb376e94f2057f951526 to your computer and use it in GitHub Desktop.
Screenshot top-level reddit comments
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium.webdriver import Firefox, FirefoxOptions | |
from selenium.common.exceptions import TimeoutException | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
import praw | |
from time import sleep | |
# Seconds used both as the explicit-wait limit and as the pause length
# after login actions.
timeout = 10

# Reddit API client; the real credentials/arguments are elided here.
r = praw.Reddit(...)

# Headless Firefox session shared by the rest of the script.
opts = FirefoxOptions()
opts.set_preference("dom.push.enabled", False)  # kill notification popup
opts.add_argument("--headless")
drv = Firefox(options=opts)
def login(username="your_username", password="your_password"):
    """Log the shared Firefox driver in to Reddit through the web login form.

    Args:
        username: Text typed into the ``loginUsername`` field.
        password: Text typed into the ``loginPassword`` field.

    Raises:
        TimeoutException: If the username field does not appear within
            ``timeout`` seconds of loading the login page.
    """
    drv.get("https://www.reddit.com/login")

    # Wait for the form instead of assuming it exists as soon as get()
    # returns.
    user = WebDriverWait(drv, timeout).until(
        lambda d: d.find_element(By.ID, "loginUsername"))
    user.send_keys(username)
    pwd = drv.find_element(By.ID, "loginPassword")
    pwd.send_keys(password)
    btn = drv.find_element(By.CSS_SELECTOR, "button[type='submit']")
    btn.click()
    sleep(timeout)  # give the post-login redirect time to finish

    # The cookie agreement popup is optional: find_elements returns an empty
    # list instead of raising when the button is absent.
    cookie_buttons = drv.find_elements(
        By.XPATH, '//button[text()="Accept all"]')
    if cookie_buttons:
        cookie_buttons[0].click()  # kill cookie agreement popup
        sleep(timeout)
# Log in, open the hottest post of the subreddit, and save one PNG per
# top-level comment, named after the comment's "t1_<id>" element id.
try:
    login()
    for post in r.subreddit("some_sub").hot(limit=1):
        cmts = "https://www.reddit.com" + post.permalink
        drv.get(cmts)
        # Drop "load more comments" placeholders: they are not real comments,
        # so waiting for a matching page element would only time out.
        post.comments.replace_more(limit=0)
        for comment in post.comments:
            element_id = f"t1_{comment.id}"
            try:
                # find_element(By.ID, ...) replaces find_element_by_id,
                # which was removed in Selenium 4.
                cmt = WebDriverWait(drv, timeout).until(
                    lambda d: d.find_element(By.ID, element_id))
            except TimeoutException:
                print("Page load timed out...")
            else:
                cmt.screenshot(element_id + ".png")
finally:
    # Always shut the headless browser down so no Firefox process is leaked.
    drv.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you for this! Huge insight: scraping the comments is significantly easier when logging in to reddit in the driver.