Skip to content

Instantly share code, notes, and snippets.

@gavin19
Last active July 16, 2023 15:07
Show Gist options
  • Save gavin19/8e2ed7547efcbb376e94f2057f951526 to your computer and use it in GitHub Desktop.
Save gavin19/8e2ed7547efcbb376e94f2057f951526 to your computer and use it in GitHub Desktop.
Screenshot top-level reddit comments
from selenium.webdriver import Firefox, FirefoxOptions
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import praw
from time import sleep
# Reddit API client. NOTE(review): the literal `...` argument looks like a
# placeholder for real praw credentials/site config - fill in before running.
r = praw.Reddit(...)
# Browser configuration: Firefox is started with the --headless argument.
opts = FirefoxOptions()
opts.add_argument("--headless")
opts.set_preference("dom.push.enabled", False) # kill notification popup
# Single shared WebDriver instance used by login() and the screenshot loop.
drv = Firefox(options=opts)
# Seconds; used for the fixed sleeps in login() and as the WebDriverWait
# limit when waiting for each comment element.
timeout = 10
def login(username="your_username", password="your_password"):
    """Log the shared driver ``drv`` in to Reddit through the web login form.

    Args:
        username: Reddit account name typed into the ``loginUsername`` field.
        password: Account password typed into the ``loginPassword`` field.

    The defaults are the placeholder values the script originally
    hard-coded, so a bare ``login()`` call behaves as before.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if one of the
            login form elements cannot be found on the page.
    """
    drv.get("https://www.reddit.com/login")
    drv.find_element(By.ID, "loginUsername").send_keys(username)
    drv.find_element(By.ID, "loginPassword").send_keys(password)
    drv.find_element(By.CSS_SELECTOR, "button[type='submit']").click()
    # Fixed pause after submitting the form, before looking for the
    # cookie banner.
    sleep(timeout)
    # The cookie agreement popup is not always shown. find_elements returns
    # an empty list instead of raising when the button is absent, so a
    # missing banner no longer aborts the login.
    accept_buttons = drv.find_elements(
        By.XPATH, '//button[text()="Accept all"]')
    if accept_buttons:
        accept_buttons[0].click()
    sleep(timeout)
# Log in, then save one PNG per top-level comment of the hottest post in the
# subreddit. The try/finally guarantees the headless Firefox process is shut
# down even if login or scraping raises.
try:
    login()
    for post in r.subreddit("some_sub").hot(limit=1):
        cmts = "https://www.reddit.com" + post.permalink
        drv.get(cmts)
        # Drop "load more comments" placeholders: they are not real comments,
        # so waiting for a matching element would only burn a full timeout.
        post.comments.replace_more(limit=0)
        for comment in post.comments:
            # The comment's element on the page is looked up by this id.
            element_id = f"t1_{comment.id}"
            try:
                # Selenium 4 removed find_element_by_id; use the By locator
                # form already used in login().
                cmt = WebDriverWait(drv, timeout).until(
                    lambda d, eid=element_id: d.find_element(By.ID, eid))
            except TimeoutException:
                print("Page load timed out...")
            else:
                cmt.screenshot(element_id + ".png")
finally:
    drv.quit()
@MichaelKhaykin
Copy link

Thank you for this! Huge insight: scraping the comments is significantly easier when logging in to reddit in the driver.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment