Skip to content

Instantly share code, notes, and snippets.

@wkarney
Last active May 5, 2023 17:04
Show Gist options
  • Save wkarney/11e5f1beeb85f7670f1a077115c681e2 to your computer and use it in GitHub Desktop.
Save wkarney/11e5f1beeb85f7670f1a077115c681e2 to your computer and use it in GitHub Desktop.
[Infinite Scroll Pages] Scraping infinite scroll pages with selenium #selenium #webscraping #python
from time import sleep
from selenium import webdriver
from bs4 import BeautifulSoup
# Scrape an infinite-scroll page: drive a headless Chrome session to the bottom
# of the page repeatedly so lazily loaded content is fetched, then hand the
# final DOM to BeautifulSoup. The parsed document ends up in `soup`.

# Seconds to wait after each scroll so newly requested content can load.
SCROLL_PAUSE_TIME = 2
# Upper bound on the number of scroll loads; raise it to fetch more content.
scroll_limit = 5

# Headless/incognito Chrome driver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--incognito")
chrome_options.add_argument("--headless")
# `options=` replaces the deprecated `chrome_options=` keyword.
# NOTE(review): `executable_path` is itself deprecated in Selenium 4 and is
# believed to be rejected by recent 4.x releases; when upgrading, pass
# `service=Service('CHROMEDRIVER_PATH')` instead -- confirm against the
# installed Selenium version.
driver = webdriver.Chrome(executable_path='CHROMEDRIVER_PATH', options=chrome_options)
try:
    driver.get('http://www.exampleurl.com')

    # Height of the document before any scrolling; used to detect growth.
    last_height = driver.execute_script("return document.body.scrollHeight")

    for _ in range(scroll_limit):
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        sleep(SCROLL_PAUSE_TIME)
        # Stop early once scrolling no longer grows the document: either the
        # end of the feed was reached or nothing loaded within the pause.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Final settle delay so the last batch can finish rendering before the
    # DOM is captured.
    sleep(2)
    html = driver.page_source
finally:
    # Always shut the browser down, even if navigation or scripting failed;
    # otherwise the headless Chrome process is left running.
    driver.quit()

soup = BeautifulSoup(html, 'lxml')
@junguler
Copy link

this helped me a lot, thank you

@wkarney
Copy link
Author

wkarney commented Nov 16, 2022

@junguler glad the script helped 👍

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment