Last active
July 16, 2023 15:07
-
-
Save gavin19/8e2ed7547efcbb376e94f2057f951526 to your computer and use it in GitHub Desktop.
Screenshot top-level reddit comments
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium.webdriver import Firefox, FirefoxOptions | |
from selenium.common.exceptions import TimeoutException | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
import praw | |
from time import sleep | |
# Fill in your own reddit app credentials below. A bare positional argument
# is treated by PRAW as the name of a praw.ini site section, so leaving the
# literal ``...`` placeholder in place fails with
# "configparser.NoSectionError: No section: Ellipsis".
r = praw.Reddit(
    client_id="your_client_id",
    client_secret="your_client_secret",
    user_agent="your_user_agent",
)

# Headless Firefox used for logging in and for taking the screenshots.
opts = FirefoxOptions()
opts.add_argument("--headless")
opts.set_preference("dom.push.enabled", False)  # kill notification popup
drv = Firefox(options=opts)

# Seconds; used both for the fixed sleeps in login() and as the
# WebDriverWait limit when looking up each comment element.
timeout = 10
def login(username="your_username", password="your_password"):
    """Log the module-level Firefox driver in to reddit via the login form.

    Args:
        username: Text typed into the ``loginUsername`` field.
        password: Text typed into the ``loginPassword`` field.
    """
    drv.get("https://www.reddit.com/login")
    drv.find_element(By.ID, "loginUsername").send_keys(username)
    drv.find_element(By.ID, "loginPassword").send_keys(password)
    drv.find_element(By.CSS_SELECTOR, "button[type='submit']").click()
    # Fixed pause after submitting the form before touching the page again.
    sleep(timeout)
    # The cookie agreement popup is not always shown. find_elements returns
    # an empty list when the button is absent, whereas find_element would
    # raise NoSuchElementException and abort the whole script.
    accept_buttons = drv.find_elements(
        By.XPATH, '//button[text()="Accept all"]')
    if accept_buttons:
        accept_buttons[0].click()  # kill cookie agreement popup
        sleep(timeout)
# Log in, then screenshot every top-level comment of the hottest post in the
# subreddit. Each screenshot is written to "t1_<comment id>.png" in the
# current working directory.
try:
    login()
    for post in r.subreddit("some_sub").hot(limit=1):
        cmts = "https://www.reddit.com" + post.permalink
        drv.get(cmts)
        # Drop "load more comments" placeholders; they have no matching
        # element on the page and would each cost a full timeout.
        post.comments.replace_more(limit=0)
        for comment in post.comments:
            element_id = f"t1_{comment.id}"
            try:
                # find_element_by_id was removed in Selenium 4.3; use the
                # By-locator form, consistent with login().
                cmt = WebDriverWait(drv, timeout).until(
                    lambda d: d.find_element(By.ID, element_id))
            except TimeoutException:
                print("Page load timed out...")
            else:
                cmt.screenshot(element_id + ".png")
finally:
    # Always shut the headless browser down so no Firefox process is left
    # running after the script ends or fails.
    drv.quit()
Thank you for this! Huge insight: scraping the comments is significantly easier when logging in to reddit in the driver.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
`Traceback (most recent call last):
File "C:\Users\achil\AppData\Local\Programs\Python\Python310\lib\configparser.py", line 847, in items
d.update(self._sections[section])
KeyError: Ellipsis
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "c:\Users\achil\Bureau\tamerelapute.py", line 8, in <module>
r = praw.Reddit(...)
File "C:\Users\achil\AppData\Local\Programs\Python\Python310\lib\site-packages\praw\util\deprecate_args.py", line 43, in wrapped
return func(**dict(zip(_old_args, args)), **kwargs)
File "C:\Users\achil\AppData\Local\Programs\Python\Python310\lib\site-packages\praw\reddit.py", line 236, in __init__
self.config = Config(
File "C:\Users\achil\AppData\Local\Programs\Python\Python310\lib\site-packages\praw\config.py", line 84, in __init__
self.custom = dict(Config.CONFIG.items(site_name), **settings)
File "C:\Users\achil\AppData\Local\Programs\Python\Python310\lib\configparser.py", line 850, in items
raise NoSectionError(section)
configparser.NoSectionError: No section: Ellipsis
You provided the name of a praw.ini configuration which does not exist.
For help with creating a Reddit instance, visit
https://praw.readthedocs.io/en/latest/code_overview/reddit_instance.html
For help on configuring PRAW, visit
https://praw.readthedocs.io/en/latest/getting_started/configuration.html`