Skip to content

Instantly share code, notes, and snippets.

@WP-LKL
Last active March 13, 2021 17:19
Show Gist options
  • Save WP-LKL/509495dd3272507c0706792d1d6e3c0e to your computer and use it in GitHub Desktop.
Save WP-LKL/509495dd3272507c0706792d1d6e3c0e to your computer and use it in GitHub Desktop.
Scraping dynamically loaded, infinitely scrolling sites such as Pinterest with Selenium and BeautifulSoup (bs4). Please consult the website's TOS and robots.txt prior to use.
from selenium import webdriver
from bs4 import BeautifulSoup
import io
import time
# Scrape the image URLs from an infinite-scroll page: load it in Chrome,
# scroll down a fixed number of times so more content is loaded, then parse
# the resulting HTML and write every <img> src to a text file.

# Search term; also used to name the output file.
var = "machinelearning"
# Any infinite-scroll URL can be used here.
url = "https://pinterest.com/search/pins/?q=" + var
ScrollNumber = 10  # The depth we wish to load (number of scrolls)
sleepTimer = 1  # Seconds to wait after each scroll for the page to load

# Exclude Chrome's "enable-logging" switch; the original author added this to
# work around a Bluetooth-related bug.
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-logging"])
driver = webdriver.Chrome(options=options)  # path=r'to/chromedriver.exe'
try:
    driver.get(url)
    # range(ScrollNumber) scrolls exactly ScrollNumber times; the previous
    # range(1, ScrollNumber) performed one scroll fewer than configured.
    for _ in range(ScrollNumber):
        # Scroll to the page's current bottom rather than a fixed pixel
        # offset, so scrolling keeps working however tall the page grows.
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight)"
        )
        time.sleep(sleepTimer)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always shut the browser down, even if loading or scrolling failed.
    driver.quit()

with io.open(f'{var}-posts.txt', 'w', encoding="utf-8") as out_file:
    for link in soup.find_all('img'):
        src = link.get('src')
        # Skip <img> tags without a src instead of writing the text "None".
        if src:
            print(src, file=out_file)  # All image URLs
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment